HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
swizzle_avx.hpp
1// Copyright Take Vos 2020-2021.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../architecture.hpp"
8#if defined(HI_HAS_AVX)
9
10#include <emmintrin.h>
11#include <smmintrin.h>
12#include <xmmintrin.h>
13#include <pmmintrin.h>
14#include <immintrin.h>
15
16namespace hi::inline v1 {
17
/** Build the immediate operand for `_mm_permute_ps()` from four swizzle indices.
 *
 * An index of 0 through 3 selects that source lane for the destination lane.
 * A negative index denotes a literal that `_mm_swizzle_ps()` writes over the
 * lane after permuting, so the permuted value of such a lane is irrelevant.
 * A literal lane therefore selects its own position, which keeps the mask
 * equal to the identity `0b11'10'01'00` whenever possible and lets the
 * caller skip the permute instruction altogether.
 *
 * @tparam A Index for destination lane 0 (mask bits 1:0).
 * @tparam B Index for destination lane 1 (mask bits 3:2).
 * @tparam C Index for destination lane 2 (mask bits 5:4).
 * @tparam D Index for destination lane 3 (mask bits 7:6).
 * @return The 8-bit permute immediate.
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int _mm_swizzle_ps_permute_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    // Every negative index accepted by the range checks above (-1, -2 and -3)
    // maps to the lane's own position, so no literal can break the identity mask.
    constexpr int lane_0 = A < 0 ? 0 : static_cast<int>(A);
    constexpr int lane_1 = B < 0 ? 1 : static_cast<int>(B);
    constexpr int lane_2 = C < 0 ? 2 : static_cast<int>(C);
    constexpr int lane_3 = D < 0 ? 3 : static_cast<int>(D);

    return lane_0 | (lane_1 << 2) | (lane_2 << 4) | (lane_3 << 6);
}
61
/** Build a 4-bit mask of the lanes that are not the literal one.
 *
 * Bit N of the result is clear when the index of lane N is -2 and set for
 * every other index. `_mm_swizzle_ps()` uses the mask as the zero-mask of
 * `_mm_insert_ps()` applied to a vector of ones.
 *
 * @tparam A Index for lane 0 (bit 0).
 * @tparam B Index for lane 1 (bit 1).
 * @tparam C Index for lane 2 (bit 2).
 * @tparam D Index for lane 3 (bit 3).
 * @return The 4-bit mask.
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int _mm_swizzle_ps_not_one_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    constexpr ssize_t literal_one = -2;

    constexpr int bit_0 = A != literal_one ? 0b0001 : 0;
    constexpr int bit_1 = B != literal_one ? 0b0010 : 0;
    constexpr int bit_2 = C != literal_one ? 0b0100 : 0;
    constexpr int bit_3 = D != literal_one ? 0b1000 : 0;

    return bit_0 | bit_1 | bit_2 | bit_3;
}
77
/** Build a 4-bit mask of the lanes that hold a literal number.
 *
 * Bit N of the result is set when the index of lane N is negative, and clear
 * when the index selects a source lane (0 through 3).
 *
 * @tparam A Index for lane 0 (bit 0).
 * @tparam B Index for lane 1 (bit 1).
 * @tparam C Index for lane 2 (bit 2).
 * @tparam D Index for lane 3 (bit 3).
 * @return The 4-bit mask.
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int _mm_swizzle_ps_number_mask() noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    // A bool converts to 0 or 1; shift it to the bit position of its lane.
    return static_cast<int>(A < 0) | (static_cast<int>(B < 0) << 1) | (static_cast<int>(C < 0) << 2) |
        (static_cast<int>(D < 0) << 3);
}
93
94template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
95[[nodiscard]] __m128 _mm_swizzle_ps(__m128 const &value) noexcept
96{
97 static_assert(A >= -3 && A < 4);
98 static_assert(B >= -3 && B < 4);
99 static_assert(C >= -3 && C < 4);
100 static_assert(D >= -3 && D < 4);
101
102 constexpr int permute_mask = _mm_swizzle_ps_permute_mask<A, B, C, D>();
103 constexpr int not_one_mask = _mm_swizzle_ps_not_one_mask<A, B, C, D>();
104 constexpr int number_mask = _mm_swizzle_ps_number_mask<A, B, C, D>();
105
106 __m128 swizzled;
107 // Clang is able to optimize these intrinsics, MSVC is not.
108 if constexpr (permute_mask != 0b11'10'01'00) {
109 swizzled = _mm_permute_ps(value, permute_mask);
110 } else {
111 swizzled = value;
112 }
113
114 __m128 numbers;
115 if constexpr (not_one_mask == 0b0000) {
116 numbers = _mm_set_ps1(1.0f);
117 } else if constexpr (not_one_mask == 0b1111) {
118 numbers = _mm_setzero_ps();
119 } else if constexpr (not_one_mask == 0b1110) {
120 numbers = _mm_set_ss(1.0f);
121 } else {
122 hilet _1111 = _mm_set_ps1(1.0f);
123 numbers = _mm_insert_ps(_1111, _1111, not_one_mask);
124 }
125
126 __m128 result;
127 if constexpr (number_mask == 0b0000) {
128 result = swizzled;
129 } else if constexpr (number_mask == 0b1111) {
130 result = numbers;
131 } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
132 result = _mm_insert_ps(swizzled, swizzled, number_mask);
133 } else {
134 result = _mm_blend_ps(swizzled, numbers, number_mask);
135 }
136 return result;
137}
138
/** Swizzle the four 32-bit integer lanes of a vector, optionally inserting literals.
 *
 * Each template argument describes one destination lane:
 *  - 0 through 3 copies that lane of `value`;
 *  - -2 writes the integer literal 1;
 *  - any other accepted negative index writes the integer literal 0.
 *
 * The literals are built as integers rather than by reusing `_mm_swizzle_ps()`,
 * because the bit pattern of the float 1.0f (0x3f800000) is not the integer 1.
 *
 * @tparam A Index for destination lane 0.
 * @tparam B Index for destination lane 1.
 * @tparam C Index for destination lane 2.
 * @tparam D Index for destination lane 3.
 * @param value The vector to swizzle.
 * @return The swizzled vector.
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] __m128i _mm_swizzle_epi32(__m128i const &value) noexcept
{
    static_assert(A >= -3 && A < 4);
    static_assert(B >= -3 && B < 4);
    static_assert(C >= -3 && C < 4);
    static_assert(D >= -3 && D < 4);

    // A literal lane selects its own position so that it cannot force a shuffle.
    constexpr int lane_0 = A < 0 ? 0 : static_cast<int>(A);
    constexpr int lane_1 = B < 0 ? 1 : static_cast<int>(B);
    constexpr int lane_2 = C < 0 ? 2 : static_cast<int>(C);
    constexpr int lane_3 = D < 0 ? 3 : static_cast<int>(D);
    constexpr int shuffle_mask = lane_0 | (lane_1 << 2) | (lane_2 << 4) | (lane_3 << 6);
    constexpr int identity_shuffle = 0b11'10'01'00;

    constexpr bool has_literal = A < 0 || B < 0 || C < 0 || D < 0;
    constexpr bool has_source_lane = A >= 0 || B >= 0 || C >= 0 || D >= 0;
    constexpr bool has_literal_one = A == -2 || B == -2 || C == -2 || D == -2;

    __m128i result = value;
    if constexpr (shuffle_mask != identity_shuffle) {
        result = _mm_shuffle_epi32(value, shuffle_mask);
    }

    if constexpr (has_literal) {
        if constexpr (has_source_lane) {
            // Clear the literal lanes, keep the lanes copied from `value`.
            auto const keep = _mm_set_epi32(D < 0 ? 0 : -1, C < 0 ? 0 : -1, B < 0 ? 0 : -1, A < 0 ? 0 : -1);
            result = _mm_and_si128(result, keep);
        } else {
            result = _mm_setzero_si128();
        }

        if constexpr (has_literal_one) {
            // The literal lanes are zero at this point, OR-ing sets them to integer 1.
            auto const ones = _mm_set_epi32(D == -2 ? 1 : 0, C == -2 ? 1 : 0, B == -2 ? 1 : 0, A == -2 ? 1 : 0);
            result = _mm_or_si128(result, ones);
        }
    }
    return result;
}
144
/** Swizzle the two double lanes of a vector, optionally inserting literals.
 *
 * Each template argument describes one destination lane:
 *  - 0 or 1 copies that lane of `value`;
 *  - -2 writes the literal 1.0;
 *  - any other accepted negative index writes the literal 0.0.
 *
 * The literals are built as doubles. Expanding a literal one into two float
 * lanes would yield the bit pattern 0x3f8000003f800000, which is not 1.0.
 *
 * @tparam A Index for destination lane 0.
 * @tparam B Index for destination lane 1.
 * @param value The vector to swizzle.
 * @return The swizzled vector.
 */
template<ssize_t A = -1, ssize_t B = -1>
[[nodiscard]] __m128d _mm_swizzle_pd(__m128d const &value) noexcept
{
    static_assert(A >= -3 && A < 2);
    static_assert(B >= -3 && B < 2);

    // Source lane per destination lane; a literal lane selects its own position.
    constexpr int lane_0 = A < 0 ? 0 : static_cast<int>(A);
    constexpr int lane_1 = B < 0 ? 1 : static_cast<int>(B);

    // _mm_shuffle_pd(x, y, imm) yields {x[imm bit 0], y[imm bit 1]}.
    if constexpr (A < 0 && B < 0) {
        return _mm_set_pd(B == -2 ? 1.0 : 0.0, A == -2 ? 1.0 : 0.0);

    } else if constexpr (A < 0) {
        // Low lane from the literal, high lane from `value`.
        constexpr int imm = lane_1 << 1;
        auto const literal = _mm_set_pd(0.0, A == -2 ? 1.0 : 0.0);
        return _mm_shuffle_pd(literal, value, imm);

    } else if constexpr (B < 0) {
        // Low lane from `value`, high lane from the literal.
        constexpr int imm = lane_0 | 0b10;
        auto const literal = _mm_set_pd(B == -2 ? 1.0 : 0.0, 0.0);
        return _mm_shuffle_pd(value, literal, imm);

    } else if constexpr (lane_0 == 0 && lane_1 == 1) {
        // Identity swizzle, nothing to do.
        return value;

    } else {
        constexpr int imm = lane_0 | (lane_1 << 1);
        return _mm_shuffle_pd(value, value, imm);
    }
}
155
/** Swizzle the two 64-bit integer lanes of a vector, optionally inserting literals.
 *
 * Each template argument describes one destination lane:
 *  - 0 or 1 copies that lane of `value`;
 *  - -2 writes the integer literal 1;
 *  - any other accepted negative index writes the integer literal 0.
 *
 * The literals are built as 64-bit integers. Only the lane movement goes
 * through `_mm_shuffle_pd()`, which copies lanes bit-exactly.
 *
 * @tparam A Index for destination lane 0.
 * @tparam B Index for destination lane 1.
 * @param value The vector to swizzle.
 * @return The swizzled vector.
 */
template<ssize_t A = -1, ssize_t B = -1>
[[nodiscard]] __m128i _mm_swizzle_epi64(__m128i const &value) noexcept
{
    static_assert(A >= -3 && A < 2);
    static_assert(B >= -3 && B < 2);

    // Source lane per destination lane; a literal lane selects its own position.
    constexpr int lane_0 = A < 0 ? 0 : static_cast<int>(A);
    constexpr int lane_1 = B < 0 ? 1 : static_cast<int>(B);

    // _mm_shuffle_pd(x, y, imm) yields {x[imm bit 0], y[imm bit 1]}.
    if constexpr (A < 0 && B < 0) {
        return _mm_set_epi64x(B == -2 ? 1 : 0, A == -2 ? 1 : 0);

    } else if constexpr (A < 0) {
        // Low lane from the literal, high lane from `value`.
        constexpr int imm = lane_1 << 1;
        auto const literal = _mm_castsi128_pd(_mm_set_epi64x(0, A == -2 ? 1 : 0));
        auto const source = _mm_castsi128_pd(value);
        return _mm_castpd_si128(_mm_shuffle_pd(literal, source, imm));

    } else if constexpr (B < 0) {
        // Low lane from `value`, high lane from the literal.
        constexpr int imm = lane_0 | 0b10;
        auto const literal = _mm_castsi128_pd(_mm_set_epi64x(B == -2 ? 1 : 0, 0));
        auto const source = _mm_castsi128_pd(value);
        return _mm_castpd_si128(_mm_shuffle_pd(source, literal, imm));

    } else if constexpr (lane_0 == 0 && lane_1 == 1) {
        // Identity swizzle, nothing to do.
        return value;

    } else {
        constexpr int imm = lane_0 | (lane_1 << 1);
        auto const source = _mm_castsi128_pd(value);
        return _mm_castpd_si128(_mm_shuffle_pd(source, source, imm));
    }
}
161
162} // namespace hi::inline v1
163
164#endif
std::ptrdiff_t ssize_t
Signed size/index into an array.
Definition required.hpp:37
#define hilet
Invariant should be the default for variables.
Definition required.hpp:23
Functions and macros for handling architectural differences between compilers, CPUs and operating systems.