HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
swizzle_avx.hpp
1// Copyright Take Vos 2021-2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../utility.hpp"
8#include "../architecture.hpp"
9#if defined(HI_HAS_AVX)
10
11#include <emmintrin.h>
12#include <smmintrin.h>
13#include <xmmintrin.h>
14#include <pmmintrin.h>
15#include <immintrin.h>
16
17hi_warning_push();
18// C26818: Switch statement does not cover all cases. Consider adding a 'default' label (es.79).
19// False positive.
20hi_warning_ignore_msvc(26818);
21
22namespace hi { inline namespace v1 {
23
/** Build the 8-bit immediate for `_mm_permute_ps()` from four swizzle indices.
 *
 * Each template argument describes one destination lane: A is lane 0, B is
 * lane 1, C is lane 2 and D is lane 3. A non-negative argument is the source
 * lane to copy from. A negative argument marks the lane as a literal; the
 * value picked by the permute for such a lane is irrelevant, so the lane's own
 * position is selected. That keeps the immediate equal to the identity
 * permutation `0b11'10'01'00` whenever no real lane movement is requested.
 *
 * Every negative value admitted by the static_asserts (-1, -2 and -3) is
 * treated the same way, so no accepted argument falls through unhandled.
 *
 * @tparam A Swizzle index for destination lane 0, in the range [-3, 3].
 * @tparam B Swizzle index for destination lane 1, in the range [-3, 3].
 * @tparam C Swizzle index for destination lane 2, in the range [-3, 3].
 * @tparam D Swizzle index for destination lane 3, in the range [-3, 3].
 * @return The immediate; bits [1:0] select the source for lane 0, bits [3:2]
 *         for lane 1, bits [5:4] for lane 2 and bits [7:6] for lane 3.
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int _mm_swizzle_ps_permute_mask() noexcept
{
    static_assert(A >= -3 and A < 4);
    static_assert(B >= -3 and B < 4);
    static_assert(C >= -3 and C < 4);
    static_assert(D >= -3 and D < 4);

    // Source lane per destination lane; literal lanes fall back to their own position.
    constexpr int lane0 = A >= 0 ? static_cast<int>(A) : 0;
    constexpr int lane1 = B >= 0 ? static_cast<int>(B) : 1;
    constexpr int lane2 = C >= 0 ? static_cast<int>(C) : 2;
    constexpr int lane3 = D >= 0 ? static_cast<int>(D) : 3;

    return lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
}
115
/** Build a 4-bit mask of the lanes that are NOT the literal one.
 *
 * Bit N of the result is cleared only when the swizzle index of lane N is
 * exactly -2; for every other index the bit is set. A is lane 0 (bit 0) and
 * D is lane 3 (bit 3).
 *
 * @tparam A Swizzle index for lane 0, in the range [-3, 3].
 * @tparam B Swizzle index for lane 1, in the range [-3, 3].
 * @tparam C Swizzle index for lane 2, in the range [-3, 3].
 * @tparam D Swizzle index for lane 3, in the range [-3, 3].
 * @return The 4-bit mask described above.
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int _mm_swizzle_ps_not_one_mask() noexcept
{
    static_assert(-3 <= A && A <= 3);
    static_assert(-3 <= B && B <= 3);
    static_assert(-3 <= C && C <= 3);
    static_assert(-3 <= D && D <= 3);

    constexpr int lane0 = A != -2 ? 0b0001 : 0;
    constexpr int lane1 = B != -2 ? 0b0010 : 0;
    constexpr int lane2 = C != -2 ? 0b0100 : 0;
    constexpr int lane3 = D != -2 ? 0b1000 : 0;

    return lane0 | lane1 | lane2 | lane3;
}
131
/** Build a 4-bit mask of the lanes that hold a literal instead of a source lane.
 *
 * Bit N of the result is set when the swizzle index of lane N is negative.
 * A is lane 0 (bit 0) and D is lane 3 (bit 3).
 *
 * @tparam A Swizzle index for lane 0, in the range [-3, 3].
 * @tparam B Swizzle index for lane 1, in the range [-3, 3].
 * @tparam C Swizzle index for lane 2, in the range [-3, 3].
 * @tparam D Swizzle index for lane 3, in the range [-3, 3].
 * @return The 4-bit mask described above.
 */
template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
[[nodiscard]] constexpr static int _mm_swizzle_ps_number_mask() noexcept
{
    static_assert(-3 <= A && A <= 3);
    static_assert(-3 <= B && B <= 3);
    static_assert(-3 <= C && C <= 3);
    static_assert(-3 <= D && D <= 3);

    // A comparison converts to 0 or 1; shift it to the lane's bit position.
    return static_cast<int>(A < 0) | (static_cast<int>(B < 0) << 1) | (static_cast<int>(C < 0) << 2) |
        (static_cast<int>(D < 0) << 3);
}
147
148template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
149[[nodiscard]] __m128 _mm_swizzle_ps(__m128 const& value) noexcept
150{
151 static_assert(A >= -3 && A < 4);
152 static_assert(B >= -3 && B < 4);
153 static_assert(C >= -3 && C < 4);
154 static_assert(D >= -3 && D < 4);
155
156 constexpr int permute_mask = _mm_swizzle_ps_permute_mask<A, B, C, D>();
157 constexpr int not_one_mask = _mm_swizzle_ps_not_one_mask<A, B, C, D>();
158 constexpr int number_mask = _mm_swizzle_ps_number_mask<A, B, C, D>();
159
160 hilet swizzled = [&] {
161 // Clang is able to optimize these intrinsics, MSVC is not.
162 if constexpr (permute_mask != 0b11'10'01'00) {
163 return _mm_permute_ps(value, permute_mask);
164 } else {
165 return value;
166 }
167 }();
168
169 hilet numbers = [&] {
170 if constexpr (not_one_mask == 0b0000) {
171 return _mm_set_ps1(1.0f);
172 } else if constexpr (not_one_mask == 0b1111) {
173 return _mm_setzero_ps();
174 } else if constexpr (not_one_mask == 0b1110) {
175 return _mm_set_ss(1.0f);
176 } else {
177 hilet _1111 = _mm_set_ps1(1.0f);
178 return _mm_insert_ps(_1111, _1111, not_one_mask);
179 }
180 }();
181
182 if constexpr (number_mask == 0b0000) {
183 return swizzled;
184 } else if constexpr (number_mask == 0b1111) {
185 return numbers;
186 } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
187 return _mm_insert_ps(swizzled, swizzled, number_mask);
188 } else {
189 return _mm_blend_ps(swizzled, numbers, number_mask);
190 }
191}
192
193template<ssize_t A, ssize_t B, ssize_t C, ssize_t D>
194[[nodiscard]] __m128i _mm_swizzle_epi32(__m128i const& value) noexcept
195{
196 static_assert(A >= -3 && A < 4);
197 static_assert(B >= -3 && B < 4);
198 static_assert(C >= -3 && C < 4);
199 static_assert(D >= -3 && D < 4);
200
201 constexpr int permute_mask = _mm_swizzle_ps_permute_mask<A, B, C, D>();
202 constexpr int not_one_mask = _mm_swizzle_ps_not_one_mask<A, B, C, D>();
203 constexpr int number_mask = _mm_swizzle_ps_number_mask<A, B, C, D>();
204
205 hilet swizzled = [&] {
206 // Clang is able to optimize these intrinsics, MSVC is not.
207 if constexpr (permute_mask != 0b11'10'01'00) {
208 return _mm_castps_si128(_mm_permute_ps(_mm_castsi128_ps(value), permute_mask));
209 } else {
210 return value;
211 }
212 }();
213
214 hilet numbers = [&] {
215 if constexpr (not_one_mask == 0b0000) {
216 return _mm_set1_epi32(1);
217 } else if constexpr (not_one_mask == 0b1111) {
218 return _mm_setzero_si128();
219 } else {
220 hilet _1111 = _mm_castsi128_ps(_mm_set1_epi32(1));
221 return _mm_castps_si128(_mm_insert_ps(_1111, _1111, not_one_mask));
222 }
223 }();
224
225 if constexpr (number_mask == 0b0000) {
226 return swizzled;
227 } else if constexpr (number_mask == 0b1111) {
228 return numbers;
229 } else if constexpr (((not_one_mask | ~number_mask) & 0b1111) == 0b1111) {
230 return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(swizzled), _mm_castsi128_ps(swizzled), number_mask));
231 } else {
232 return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(swizzled), _mm_castsi128_ps(numbers), number_mask));
233 }
234}
235
236template<ssize_t A = -1, ssize_t B = -1>
237[[nodiscard]] __m128d _mm_swizzle_pd(__m128d const& value) noexcept
238{
239 constexpr auto A1 = A >= 0 ? A * 2 : A;
240 constexpr auto A2 = A >= 0 ? A1 + 1 : A1;
241 constexpr auto B1 = B >= 0 ? B * 2 : B;
242 constexpr auto B2 = B >= 0 ? B1 + 1 : B1;
243
244 return _mm_castps_pd(_mm_swizzle_ps<A1, A2, B1, B2>(_mm_castpd_ps(value)));
245}
246
247template<ssize_t A = -1, ssize_t B = -1>
248[[nodiscard]] __m128i _mm_swizzle_epi64(__m128i const& value) noexcept
249{
250 return _mm_castpd_si128(_mm_swizzle_pd<A, B>(_mm_castsi128_pd(value)));
251}
252}} // namespace hi::v1
253
254hi_warning_pop();
255
256#endif
Utilities used by the HikoGUI library itself.
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
Functions and macros for handling architectural differences between compilers, CPUs and operating systems.
DOXYGEN BUG.
Definition algorithm.hpp:15
geometry/margins.hpp
Definition assert.hpp:18
std::ptrdiff_t ssize_t
Signed size/index into an array.
Definition utility.hpp:173