HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
array_intrinsic_f64x4_x86.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "array_intrinsic.hpp"
8#include "../macros.hpp"
9#include <cstddef>
10#include <array>
11#include <limits>
12
13#include <xmmintrin.h>
14#include <emmintrin.h>
15#include <pmmintrin.h>
16#include <tmmintrin.h>
17#include <smmintrin.h>
18#include <nmmintrin.h>
19#include <immintrin.h>
20
21hi_export_module(hikogui.SIMD.array_intrinsic_f32x4);
22
23hi_export namespace hi {
24inline namespace v1 {
25
26#if defined(HI_HAS_AVX)
27template<>
28struct array_intrinsic<double, 4> {
29 using value_type = double;
30 using register_type = __m256d;
31 using array_type = std::array<double, 4>;
32
35 [[nodiscard]] hi_force_inline static register_type L(array_type a) noexcept
36 {
37 return _mm256_loadu_pd(a.data());
38 }
39
42 [[nodiscard]] hi_force_inline static array_type S(register_type a) noexcept
43 {
44 auto r = array_type{};
45 _mm256_storeu_pd(r.data(), a);
46 return r;
47 }
48
49 [[nodiscard]] hi_force_inline static array_type undefined() noexcept
50 {
51 return S(_mm256_undefined_pd());
52 }
53
54 [[nodiscard]] hi_force_inline static array_type set(double a, double b, double c, double d) noexcept
55 {
56 return S(_mm256_set_pd(d, c, b, a));
57 }
58
59 [[nodiscard]] hi_force_inline static array_type set(double a) noexcept
60 {
61 return S(_mm256_set_pd(0.0, 0.0, 0.0, a));
62 }
63
64 [[nodiscard]] hi_force_inline static array_type set_zero() noexcept
65 {
66 return S(_mm256_setzero_pd());
67 }
68
69 [[nodiscard]] hi_force_inline static array_type set_all_ones() noexcept
70 {
71#if defined(HI_HAS_AVX2)
73#else
75#endif
76 }
77
78 [[nodiscard]] hi_force_inline static array_type set_one() noexcept
79 {
80#if defined(HI_HAS_AVX2)
83#else
84 return S(_mm256_set1_pd(1.0f));
85#endif
86 }
87
88 template<size_t I>
89 [[nodiscard]] hi_force_inline static double get(array_type a) noexcept
90 {
91 static_assert(I < 4);
92
93 if constexpr (I == 0) {
94 return _mm256_cvtsd_f64(L(a));
95 } else constexpr (I == 1) {
96 return _mm256_cvtsd_f64(_mm256_shuffle_pd(L(a), L(a), 0b1);
97 } else {
98 auto const tmp = _mm256_extractf128_pd(L(a), 0b1);
99 if constexpr (I == 2) {
100 return _mm_cvtsd_f64(tmp);
101 } else {
102 return _mm_cvtsd_f64(_mm_permute_pd(tmp, 0b1));
103 }
104 }
105 }
106
107 [[nodiscard]] hi_force_inline static array_type broadcast(double a) noexcept
108 {
109 return S(_mm256_set1_pd(a));
110 }
111
112 [[nodiscard]] hi_force_inline static array_type broadcast(array_type a) noexcept
113 {
114 auto tmp = L(a);
115 auto lo = _mm256_extractf128_pd(tmp, 0b0);
117 return S(_mm256_permute_pd(tmp, 0b0000));
118 }
119
122 [[nodiscard]] hi_force_inline static std::size_t get_mask(array_type a) noexcept
123 {
124 return _mm256_movemask_pd(L(a));
125 }
126
127 [[nodiscard]] hi_force_inline static array_type neg(array_type a) noexcept
128 {
129 return S(_mm256_sub_pd(_mm256_setzero_pd(), L(a)));
130 }
131
132 template<std::size_t Mask>
133 [[nodiscard]] hi_force_inline constexpr static array_type neg_mask(array_type a) noexcept
134 {
135 if constexpr (Mask == 0) {
136 return a;
137 } else if constexpr (Mask == 0b1111) {
138 return S(_mm256_sub_pd(_mm256_setzero_pd(), L(a)));
139#if defined(HI_HAS_SSE3)
140 } else if constexpr (Mask == 0b0101) {
141 return S(_mm256_addsub_pd(_mm256_setzero_pd(), L(a)));
142#endif
143 } else {
144 auto const tmp = _mm256_sub_pd(_mm256_setzero_pd(), L(a));
145 return blend<Mask>(a, S(tmp));
146 }
147 }
148
149 [[nodiscard]] hi_force_inline static array_type inv(array_type a) noexcept
150 {
151 return _xor(set_all_ones(), a);
152 }
153
154 [[nodiscard]] hi_force_inline static array_type rcp(array_type a) noexcept
155 {
156 return S(_mm256_rcp_pd(L(a)));
157 }
158
159 [[nodiscard]] hi_force_inline static array_type sqrt(array_type a) noexcept
160 {
161 return S(_mm256_sqrt_pd(L(a)));
162 }
163
164 [[nodiscard]] hi_force_inline static array_type rsqrt(array_type a) noexcept
165 {
166 return S(_mm256_rsqrt_pd(L(a)));
167 }
168
#if defined(HI_HAS_SSE2)
/** Round every element to an integral value.
 *
 * `_mm256_round_pd` is part of AVX, so no integer-conversion fallback is
 * needed in this specialization. Rounding follows the current MXCSR rounding
 * mode.
 *
 * NOTE(review): the rounding mode of the original SSE4.1 branch is not
 * visible in this listing; confirm `_MM_FROUND_CUR_DIRECTION` is intended.
 *
 * @param a The array to round.
 * @return The element-wise rounded array.
 */
[[nodiscard]] hi_force_inline static array_type round(array_type a) noexcept
{
    return S(_mm256_round_pd(L(a), _MM_FROUND_CUR_DIRECTION));
}
#endif
187
188#if defined(HI_HAS_SSE4_1)
189 [[nodiscard]] hi_force_inline static array_type floor(array_type a) noexcept
190 {
191 return S(_mm256_floor_pd(L(a)));
192 }
193
194 [[nodiscard]] hi_force_inline static array_type ceil(array_type a) noexcept
195 {
196 return S(_mm256_ceil_pd(L(a)));
197 }
198#endif
199
200 [[nodiscard]] hi_force_inline static array_type add(array_type a, array_type b) noexcept
201 {
202 return S(_mm256_add_pd(L(a), L(b)));
203 }
204
205 [[nodiscard]] hi_force_inline static array_type sub(array_type a, array_type b) noexcept
206 {
207 return S(_mm256_sub_pd(L(a), L(b)));
208 }
209
210 template<std::size_t Mask>
211 [[nodiscard]] hi_force_inline constexpr static array_type addsub_mask(array_type a, array_type b) noexcept
212 {
213 if constexpr (Mask == 0) {
214 return sub(a, b);
215 } else if constexpr (Mask == 0b1111) {
216 return add(a, b);
217#if defined(HI_HAS_SSE3)
218 } else if constexpr (Mask == 0b1010) {
219 return S(_mm256_addsub_pd(L(a), L(b)));
220#endif
221 } else {
222 return blend<Mask>(sub(a, b), add(a, b));
223 }
224 }
225
226 [[nodiscard]] hi_force_inline static array_type mul(array_type a, array_type b) noexcept
227 {
228 return S(_mm256_mul_pd(L(a), L(b)));
229 }
230
231 [[nodiscard]] hi_force_inline static array_type div(array_type a, array_type b) noexcept
232 {
233 return S(_mm256_div_pd(L(a), L(b)));
234 }
235
236 [[nodiscard]] hi_force_inline static array_type eq(array_type a, array_type b) noexcept
237 {
238 return S(_mm256_cmpeq_pd(L(a), L(b)));
239 }
240
241 [[nodiscard]] hi_force_inline static array_type ne(array_type a, array_type b) noexcept
242 {
243 return S(_mm256_cmpneq_pd(L(a), L(b)));
244 }
245
246 [[nodiscard]] hi_force_inline static array_type lt(array_type a, array_type b) noexcept
247 {
248 return S(_mm256_cmplt_pd(L(a), L(b)));
249 }
250
251 [[nodiscard]] hi_force_inline static array_type gt(array_type a, array_type b) noexcept
252 {
253 return S(_mm256_cmpgt_pd(L(a), L(b)));
254 }
255
256 [[nodiscard]] hi_force_inline static array_type le(array_type a, array_type b) noexcept
257 {
258 return S(_mm256_cmple_pd(L(a), L(b)));
259 }
260
261 [[nodiscard]] hi_force_inline static array_type ge(array_type a, array_type b) noexcept
262 {
263 return S(_mm256_cmpge_pd(L(a), L(b)));
264 }
265
266 [[nodiscard]] hi_force_inline static bool test(array_type a, array_type b) noexcept
267 {
268#if defined(HI_HAS_SSE4_1)
269 return static_cast<bool>(_mm256_testz_si128(_mm256_castps_si128(L(a)), _mm256_castps_si128(L(b))));
270#elif defined(HI_HAS_SSE2)
272#else
273 auto tmp = std::array<float, 4>{};
274 _mm256_store_pd(tmp.data(), _mm256_and_pd(L(a), L(b)));
275
276 return (std::bit_cast<uint32_t>(std::get<0>(tmp)) | std::bit_cast<uint32_t>(std::get<1>(tmp)) |
277 std::bit_cast<uint32_t>(std::get<2>(tmp)) | std::bit_cast<uint32_t>(std::get<3>(tmp))) == 0;
278#endif
279 }
280
281 [[nodiscard]] hi_force_inline static array_type max(array_type a, array_type b) noexcept
282 {
283 return S(_mm256_max_pd(L(a), L(b)));
284 }
285
286 [[nodiscard]] hi_force_inline static array_type min(array_type a, array_type b) noexcept
287 {
288 return S(_mm256_min_pd(L(a), L(b)));
289 }
290
291 [[nodiscard]] hi_force_inline static array_type clamp(array_type v, array_type lo, array_type hi) noexcept
292 {
293 return S(_mm256_min_pd(_mm256_max_pd(L(v), L(lo)), L(hi)));
294 }
295
296 [[nodiscard]] hi_force_inline static array_type _or(array_type a, array_type b) noexcept
297 {
298 return S(_mm256_or_pd(L(a), L(b)));
299 }
300
301 [[nodiscard]] hi_force_inline static array_type _and(array_type a, array_type b) noexcept
302 {
303 return S(_mm256_and_pd(L(a), L(b)));
304 }
305
306 [[nodiscard]] hi_force_inline static array_type _xor(array_type a, array_type b) noexcept
307 {
308 return S(_mm256_xor_pd(L(a), L(b)));
309 }
310
311 [[nodiscard]] hi_force_inline static array_type andnot(array_type a, array_type b) noexcept
312 {
313 return S(_mm256_andnot_pd(L(a), L(b)));
314 }
315
316#if defined(HI_HAS_SSE2)
// sll: shift operation taking an array and a shift count `b`
// (presumably shift-left-logical on the raw bits; confirm against the repository).
// NOTE(review): this listing shows no return statement although the function
// returns array_type; a line appears to be missing before the closing brace.
// NOTE(review): `_mm256_set_epi32` is documented with eight parameters, while
// four are passed here; verify the intended intrinsic for the count.
317 [[nodiscard]] hi_force_inline static array_type sll(array_type a, unsigned int b) noexcept
318 {
// Places the shift count in the lowest 32-bit element of an integer vector.
319 auto const b_ = _mm256_set_epi32(0, 0, 0, b);
321 }
322#endif
323
324#if defined(HI_HAS_SSE2)
// srl: shift operation taking an array and a shift count `b`
// (presumably shift-right-logical on the raw bits; confirm against the repository).
// NOTE(review): this listing shows no return statement although the function
// returns array_type; a line appears to be missing before the closing brace.
// NOTE(review): `_mm256_set_epi32` is documented with eight parameters, while
// four are passed here; verify the intended intrinsic for the count.
325 [[nodiscard]] hi_force_inline static array_type srl(array_type a, unsigned int b) noexcept
326 {
// Places the shift count in the lowest 32-bit element of an integer vector.
327 auto const b_ = _mm256_set_epi32(0, 0, 0, b);
329 }
330#endif
331
332#if defined(HI_HAS_SSE2)
// sra: shift operation taking an array and a shift count `b`
// (presumably shift-right-arithmetic on the raw bits; confirm against the repository).
// NOTE(review): this listing shows no return statement although the function
// returns array_type; a line appears to be missing before the closing brace.
// NOTE(review): `_mm256_set_epi32` is documented with eight parameters, while
// four are passed here; verify the intended intrinsic for the count.
333 [[nodiscard]] hi_force_inline static array_type sra(array_type a, unsigned int b) noexcept
334 {
// Places the shift count in the lowest 32-bit element of an integer vector.
335 auto const b_ = _mm256_set_epi32(0, 0, 0, b);
337 }
338#endif
339
340 [[nodiscard]] hi_force_inline static array_type hadd(array_type a, array_type b) noexcept
341 {
342#if defined(HI_HAS_SSE3)
343 return S(_mm256_hadd_pd(L(a), L(b)));
344#else
345 auto const a_ = L(a);
346 auto const b_ = L(b);
347 auto const tmp1 = _mm256_shuffle_pd(a_, b_, 0b10'00'10'00);
348 auto const tmp2 = _mm256_shuffle_pd(a_, b_, 0b11'01'11'01);
349 return S(_mm256_add_pd(tmp1, tmp2));
350#endif
351 }
352
353 [[nodiscard]] hi_force_inline static array_type hsub(array_type a, array_type b) noexcept
354 {
355#if defined(HI_HAS_SSE3)
356 return S(_mm256_hsub_pd(L(a), L(b)));
357#else
358 auto const a_ = L(a);
359 auto const b_ = L(b);
360 auto const tmp1 = _mm256_shuffle_pd(a_, b_, 0b10'00'10'00);
361 auto const tmp2 = _mm256_shuffle_pd(a_, b_, 0b11'01'11'01);
362 return S(_mm256_sub_pd(tmp1, tmp2));
363#endif
364 }
365
/** Pack four element indices into an immediate, two bits per element.
 *
 * The index for result element i is stored at bit position `2 * i`. A negative
 * index means "keep the element in place" and is encoded as `i` itself.
 *
 * @tparam Indices Exactly four source indices, one per result element.
 * @return The packed immediate.
 */
template<int... Indices>
[[nodiscard]] constexpr static unsigned int _make_indices_imm() noexcept
{
    static_assert(sizeof...(Indices) == 4);

    constexpr int selectors[] = {Indices...};
    unsigned int imm = 0;
    size_t shift = 0;
    for (size_t lane = 0; lane != 4; ++lane, shift += 2) {
        size_t source = lane;
        if (selectors[lane] >= 0) {
            source = static_cast<size_t>(selectors[lane]);
        }
        imm |= static_cast<unsigned int>(source << shift);
    }
    return imm;
}
379
380 template<int... Indices>
381 [[nodiscard]] hi_force_inline static array_type shuffle(array_type a) noexcept
382 {
383 return S(_mm256_shuffle_pd(L(a), L(a), _make_indices_imm<Indices...>()));
384 }
385
386 template<size_t Mask>
387 [[nodiscard]] hi_force_inline static array_type blend(array_type a, array_type b) noexcept
388 {
389#if defined(HI_HAS_SSE4_1)
390 return S(_mm256_blend_pd(L(a), L(b), Mask));
391#else
392 auto const lo = _mm256_unpacklo_pd(L(a), L(b));
393 auto const hi = _mm256_unpackhi_pd(L(a), L(b));
394 // clang-format off
395 constexpr auto indices =
396 (Mask & 0b0001 ? 0b00'00'00'01U : 0b00'00'00'00U) |
397 (Mask & 0b0010 ? 0b00'00'11'00U : 0b00'00'10'00U) |
398 (Mask & 0b0100 ? 0b00'01'00'00U : 0b00'00'00'00U) |
399 (Mask & 0b1000 ? 0b11'00'00'00U : 0b10'00'00'00U);
400 // clang-format on
401 return S(_mm256_shuffle_pd(lo, hi, indices));
402#endif
403 }
404
405 [[nodiscard]] hi_force_inline static std::array<array_type, 4> transpose(array_type a, array_type b, array_type c, array_type d)
406 {
407 auto a_ = L(a);
408 auto b_ = L(b);
409 auto c_ = L(c);
410 auto d_ = L(d);
412 return {S(a_), S(b_), S(c_), S(d_)};
413 }
414
415 [[nodiscard]] hi_force_inline static array_type sum(array_type a) noexcept
416 {
417 auto const x_y_z_w = L(a);
418 auto const y_x_w_z = _mm256_shuffle_pd(x_y_z_w, x_y_z_w, 0b10'11'00'01);
423 }
424
425 template<size_t Mask>
426 [[nodiscard]] hi_force_inline static array_type dot(array_type a, array_type b) noexcept
427 {
428#if defined(HI_HAS_SSE4_1)
429 return S(_mm256_dp_pd(L(a), L(b), (Mask << 4) | 0b1111));
430#else
431 auto const multiplied = blend<Mask>(set_zero(), mul(a, b));
432 return sum(multiplied);
433#endif
434 }
435};
436#endif
437
438} // namespace v1
439} // namespace hi
@ round
The end cap of the line is round.
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
The HikoGUI namespace.
Definition recursive_iterator.hpp:15
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:378
T ceil(T... args)
T div(T... args)
T floor(T... args)
T max(T... args)
T min(T... args)
T shuffle(T... args)
T sqrt(T... args)