HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
array_intrinsic_f64x2_x86.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "array_intrinsic.hpp"
8#include "macros.hpp"
9#include <cstddef>
10#include <array>
11#include <limits>
12
13#include <xmmintrin.h>
14#include <emmintrin.h>
15#include <pmmintrin.h>
16#include <tmmintrin.h>
17#include <smmintrin.h>
18#include <nmmintrin.h>
19#include <immintrin.h>
20
21hi_export_module(hikocpu : array_intrinsic_f32x4);
22
23hi_export namespace hi {
24inline namespace v1 {
25
26#if defined(HI_HAS_SSE2)
27template<>
28struct array_intrinsic<double, 2> {
29 using value_type = double;
30 using register_type = __m128d;
31 using array_type = std::array<double, 2>;
32
35 [[nodiscard]] hi_force_inline static register_type L(array_type a) noexcept
36 {
37 return _mm_loadu_pd(a.data());
38 }
39
42 [[nodiscard]] hi_force_inline static array_type S(register_type a) noexcept
43 {
44 auto r = array_type{};
45 _mm_storeu_pd(r.data(), a);
46 return r;
47 }
48
49 [[nodiscard]] hi_force_inline static array_type undefined() noexcept
50 {
51 return S(_mm_undefined_pd());
52 }
53
54 [[nodiscard]] hi_force_inline static array_type set(float a, float b) noexcept
55 {
56 return S(_mm_set_pd(b, a));
57 }
58
59 [[nodiscard]] hi_force_inline static array_type set(float a) noexcept
60 {
61 return S(_mm_set_pd(0.0, a));
62 }
63
64 [[nodiscard]] hi_force_inline static array_type set_zero() noexcept
65 {
66 return S(_mm_setzero_pd());
67 }
68
69 [[nodiscard]] hi_force_inline static array_type set_all_ones() noexcept
70 {
71 return S(_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())));
72 }
73
74 [[nodiscard]] hi_force_inline static array_type set_one() noexcept
75 {
76 auto const ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
77 return S(_mm_castsi128_pd(_mm_srli_epi64(_mm_slli_epi64(ones, 54), 2)));
78 }
79
80 template<size_t I>
81 [[nodiscard]] hi_force_inline static float get(array_type a) noexcept
82 {
83 static_assert(I < 2);
84 if constexpr (I == 0) {
85 return _mm_cvtsd_f64(L(a));
86 } else {
87 return _mm_cvtsd_f64(_mm_shuffle_pd(L(a), L(a), I));
88 }
89 }
90
91 [[nodiscard]] hi_force_inline static array_type broadcast(float a) noexcept
92 {
93 return S(_mm_set1_pd(a));
94 }
95
96 [[nodiscard]] hi_force_inline static array_type broadcast(array_type a) noexcept
97 {
98 return S(_mm_shuffle_pd(L(a), L(a), 0));
99 }
100
101 [[nodiscard]] hi_force_inline static array_type set_mask(std::size_t mask) noexcept
102 {
103 // clang-format off
104 auto const tmp = _mm_set_epi32(
105 static_cast<int32_t>(mask) << 30,
106 static_cast<int32_t>(mask) << 30,
107 static_cast<int32_t>(mask) << 31,
108 static_cast<int32_t>(mask) << 31);
109 // clang-format on
110
111 return S(_mm_castsi128_pd(_mm_srai_epi32(tmp, 31)));
112 }
113
116 [[nodiscard]] hi_force_inline static std::size_t get_mask(array_type a) noexcept
117 {
118 return _mm_movemask_pd(L(a));
119 }
120
121 [[nodiscard]] hi_force_inline static array_type neg(array_type a) noexcept
122 {
123 return S(_mm_sub_pd(_mm_setzero_pd(), L(a)));
124 }
125
126 template<std::size_t Mask>
127 [[nodiscard]] hi_force_inline constexpr static array_type neg_mask(array_type a) noexcept
128 {
129 if constexpr (Mask == 0) {
130 return a;
131 } else if constexpr (Mask == 0b11) {
132 return S(_mm_sub_pd(_mm_setzero_pd(), L(a)));
133#if defined(HI_HAS_SSE3)
134 } else if constexpr (Mask == 0b01) {
135 return S(_mm_addsub_pd(_mm_setzero_pd(), L(a)));
136#endif
137 } else {
138 auto const tmp = _mm_sub_pd(_mm_setzero_pd(), L(a));
139 return blend<Mask>(a, S(tmp));
140 }
141 }
142
143 [[nodiscard]] hi_force_inline static array_type inv(array_type a) noexcept
144 {
145 return _xor(set_all_ones(), a);
146 }
147
148 [[nodiscard]] hi_force_inline static array_type sqrt(array_type a) noexcept
149 {
150 return S(_mm_sqrt_pd(L(a)));
151 }
152
153#if defined(HI_HAS_SSE4_1)
154 [[nodiscard]] hi_force_inline static array_type round(array_type a) noexcept
155 {
156 return S(_mm_round_pd(L(a), _MM_FROUND_CUR_DIRECTION));
157 }
158
159 [[nodiscard]] hi_force_inline static array_type floor(array_type a) noexcept
160 {
161 return S(_mm_floor_pd(L(a)));
162 }
163
164 [[nodiscard]] hi_force_inline static array_type ceil(array_type a) noexcept
165 {
166 return S(_mm_ceil_pd(L(a)));
167 }
168#endif
169
170 [[nodiscard]] hi_force_inline static array_type add(array_type a, array_type b) noexcept
171 {
172 return S(_mm_add_pd(L(a), L(b)));
173 }
174
175 [[nodiscard]] hi_force_inline static array_type sub(array_type a, array_type b) noexcept
176 {
177 return S(_mm_sub_pd(L(a), L(b)));
178 }
179
180 template<std::size_t Mask>
181 [[nodiscard]] hi_force_inline constexpr static array_type addsub_mask(array_type a, array_type b) noexcept
182 {
183 if constexpr (Mask == 0) {
184 return sub(a, b);
185 } else if constexpr (Mask == 0b11) {
186 return add(a, b);
187#if defined(HI_HAS_SSE3)
188 } else if constexpr (Mask == 0b10) {
189 return S(_mm_addsub_pd(L(a), L(b)));
190#endif
191 } else {
192 return blend<Mask>(sub(a, b), add(a, b));
193 }
194 }
195
196 [[nodiscard]] hi_force_inline static array_type mul(array_type a, array_type b) noexcept
197 {
198 return S(_mm_mul_pd(L(a), L(b)));
199 }
200
201 [[nodiscard]] hi_force_inline static array_type div(array_type a, array_type b) noexcept
202 {
203 return S(_mm_div_pd(L(a), L(b)));
204 }
205
206 [[nodiscard]] hi_force_inline static array_type eq(array_type a, array_type b) noexcept
207 {
208 return S(_mm_cmpeq_pd(L(a), L(b)));
209 }
210
211 [[nodiscard]] hi_force_inline static array_type ne(array_type a, array_type b) noexcept
212 {
213 return S(_mm_cmpneq_pd(L(a), L(b)));
214 }
215
216 [[nodiscard]] hi_force_inline static array_type lt(array_type a, array_type b) noexcept
217 {
218 return S(_mm_cmplt_pd(L(a), L(b)));
219 }
220
221 [[nodiscard]] hi_force_inline static array_type gt(array_type a, array_type b) noexcept
222 {
223 return S(_mm_cmpgt_pd(L(a), L(b)));
224 }
225
226 [[nodiscard]] hi_force_inline static array_type le(array_type a, array_type b) noexcept
227 {
228 return S(_mm_cmple_pd(L(a), L(b)));
229 }
230
231 [[nodiscard]] hi_force_inline static array_type ge(array_type a, array_type b) noexcept
232 {
233 return S(_mm_cmpge_pd(L(a), L(b)));
234 }
235
236 [[nodiscard]] hi_force_inline static bool test(array_type a, array_type b) noexcept
237 {
238#if defined(HI_HAS_SSE4_1)
239 return static_cast<bool>(_mm_testz_si128(_mm_castpd_si128(L(a)), _mm_castpd_si128(L(b))));
240#else
241 return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_castpd_si128(_mm_and_pd(L(a), L(b))), _mm_setzero_si128())) == 0xffff;
242#endif
243 }
244
245 [[nodiscard]] hi_force_inline static array_type max(array_type a, array_type b) noexcept
246 {
247 return S(_mm_max_pd(L(a), L(b)));
248 }
249
250 [[nodiscard]] hi_force_inline static array_type min(array_type a, array_type b) noexcept
251 {
252 return S(_mm_min_pd(L(a), L(b)));
253 }
254
255 [[nodiscard]] hi_force_inline static array_type clamp(array_type v, array_type lo, array_type hi) noexcept
256 {
257 return S(_mm_min_pd(_mm_max_pd(L(v), L(lo)), L(hi)));
258 }
259
260 [[nodiscard]] hi_force_inline static array_type _or(array_type a, array_type b) noexcept
261 {
262 return S(_mm_or_pd(L(a), L(b)));
263 }
264
265 [[nodiscard]] hi_force_inline static array_type _and(array_type a, array_type b) noexcept
266 {
267 return S(_mm_and_pd(L(a), L(b)));
268 }
269
270 [[nodiscard]] hi_force_inline static array_type _xor(array_type a, array_type b) noexcept
271 {
272 return S(_mm_xor_pd(L(a), L(b)));
273 }
274
275 [[nodiscard]] hi_force_inline static array_type andnot(array_type a, array_type b) noexcept
276 {
277 return S(_mm_andnot_pd(L(a), L(b)));
278 }
279
280 [[nodiscard]] hi_force_inline static array_type sll(array_type a, unsigned int b) noexcept
281 {
282 auto const b_ = _mm_set_epi32(0, 0, 0, b);
283 return S(_mm_castsi128_pd(_mm_sll_epi64(_mm_castpd_si128(L(a)), b_)));
284 }
285
286 [[nodiscard]] hi_force_inline static array_type srl(array_type a, unsigned int b) noexcept
287 {
288 auto const b_ = _mm_set_epi32(0, 0, 0, b);
289 return S(_mm_castsi128_pd(_mm_srl_epi64(_mm_castpd_si128(L(a)), b_)));
290 }
291
292 [[nodiscard]] hi_force_inline static array_type sra(array_type a, unsigned int b) noexcept
293 {
294 auto const b_ = _mm_set_epi32(0, 0, 0, b);
295 return S(_mm_castsi128_pd(_mm_sra_epi64(_mm_castpd_si128(L(a)), b_)));
296 }
297
298#if defined(HI_HAS_SSE3)
299 [[nodiscard]] hi_force_inline static array_type hadd(array_type a, array_type b) noexcept
300 {
301 return S(_mm_hadd_pd(L(a), L(b)));
302 }
303#endif
304
305#if defined(HI_HAS_SSE3)
306 [[nodiscard]] hi_force_inline static array_type hsub(array_type a, array_type b) noexcept
307 {
308 return S(_mm_hsub_pd(L(a), L(b)));
309 }
310#endif
311
312 template<int... Indices>
313 [[nodiscard]] constexpr static unsigned int _make_indices_imm() noexcept
314 {
315 static_assert(sizeof...(Indices) == 2);
316
317 constexpr auto indices = std::array{Indices...};
318 auto r = 0U;
319 for (size_t i = 0; i != 2; ++i) {
320 auto const index = indices[i] < 0 ? i : indices[i];
321 r |= index << (i * 2);
322 }
323 return r;
324 }
325
326 template<int... Indices>
327 [[nodiscard]] hi_force_inline static array_type shuffle(array_type a) noexcept
328 {
329 return S(_mm_shuffle_pd(L(a), L(a), _make_indices_imm<Indices...>()));
330 }
331
332 template<size_t Mask>
333 [[nodiscard]] hi_force_inline static array_type blend(array_type a, array_type b) noexcept
334 {
335#if defined(HI_HAS_SSE4_1)
336 return S(_mm_blend_pd(L(a), L(b), Mask));
337#else
338 auto const lo = _mm_unpacklo_pd(L(a), L(b));
339 auto const hi = _mm_unpackhi_pd(L(a), L(b));
340 return S(_mm_shuffle_pd(lo, hi, Mask));
341#endif
342 }
343
344 [[nodiscard]] hi_force_inline static array_type sum(array_type a) noexcept
345 {
346 auto const tmp = _mm_shuffle_pd(L(a), L(a), 0b01);
347 return S(_mm_add_pd(L(a), tmp));
348 }
349
350 template<size_t Mask>
351 [[nodiscard]] hi_force_inline static array_type dot(array_type a, array_type b) noexcept
352 {
353#if defined(HI_HAS_SSE4_1)
354 return S(_mm_dp_pd(L(a), L(b), (Mask << 2) | 0b11));
355#else
356 auto const multiplied = blend<Mask>(set_zero(), mul(a, b));
357 return sum(multiplied);
358#endif
359 }
360};
361#endif
362
363} // namespace v1
364} // namespace hi
@ round
The end cap of the line is round.
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
T ceil(T... args)
T div(T... args)
T floor(T... args)
T shuffle(T... args)