HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
array_intrinsic_f64x4_x86.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "array_intrinsic.hpp"
8#include "macros.hpp"
9#include <cstddef>
10#include <array>
11#include <limits>
12
13#include <xmmintrin.h>
14#include <emmintrin.h>
15#include <pmmintrin.h>
16#include <tmmintrin.h>
17#include <smmintrin.h>
18#include <nmmintrin.h>
19#include <immintrin.h>
20
// NOTE(review): this header specializes array_intrinsic<double, 4> (f64x4), yet the module
// partition declared below is named array_intrinsic_f32x4. This looks like a copy/paste from
// the f32x4 header -- confirm the intended partition name before changing it.
21hi_export_module(hikocpu : array_intrinsic_f32x4);
22
23hi_export namespace hi {
24inline namespace v1 {
25
26#if defined(HI_HAS_AVX)
27template<>
28struct array_intrinsic<double, 4> {
29 using value_type = double;
30 using register_type = __m256d;
31 using array_type = std::array<double, 4>;
32
35 [[nodiscard]] hi_force_inline static register_type L(array_type a) noexcept
36 {
37 return _mm256_loadu_pd(a.data());
38 }
39
42 [[nodiscard]] hi_force_inline static array_type S(register_type a) noexcept
43 {
44 auto r = array_type{};
45 _mm256_storeu_pd(r.data(), a);
46 return r;
47 }
48
49 [[nodiscard]] hi_force_inline static array_type undefined() noexcept
50 {
51 return S(_mm256_undefined_pd());
52 }
53
54 [[nodiscard]] hi_force_inline static array_type set(double a, double b, double c, double d) noexcept
55 {
56 return S(_mm256_set_pd(d, c, b, a));
57 }
58
59 [[nodiscard]] hi_force_inline static array_type set(double a) noexcept
60 {
61 return S(_mm256_set_pd(0.0, 0.0, 0.0, a));
62 }
63
64 [[nodiscard]] hi_force_inline static array_type set_zero() noexcept
65 {
66 return S(_mm256_setzero_pd());
67 }
68
69 [[nodiscard]] hi_force_inline static array_type set_all_ones() noexcept
70 {
71 return S(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OS));
72 }
73
74 [[nodiscard]] hi_force_inline static array_type set_one() noexcept
75 {
76 return S(_mm256_set1_pd(1.0f));
77 }
78
79 template<size_t I>
80 [[nodiscard]] hi_force_inline static double get(array_type a) noexcept
81 {
82 static_assert(I < 4);
83
84 if constexpr (I == 0) {
85 return _mm256_cvtsd_f64(L(a));
86 } else if constexpr (I == 1) {
87 return _mm256_cvtsd_f64(_mm256_shuffle_pd(L(a), L(a), 0b1));
88 } else {
89 auto const tmp = _mm256_extractf128_pd(L(a), 0b1);
90 if constexpr (I == 2) {
91 return _mm_cvtsd_f64(tmp);
92 } else {
93 return _mm_cvtsd_f64(_mm_permute_pd(tmp, 0b1));
94 }
95 }
96 }
97
98 [[nodiscard]] hi_force_inline static array_type broadcast(double a) noexcept
99 {
100 return S(_mm256_set1_pd(a));
101 }
102
103 [[nodiscard]] hi_force_inline static array_type broadcast(array_type a) noexcept
104 {
105 auto tmp = L(a);
106 auto lo = _mm256_extractf128_pd(tmp, 0b0);
107 tmp = _mm256_insertf128_pd(tmp, lo, 0b1);
108 return S(_mm256_permute_pd(tmp, 0b0000));
109 }
110
113 [[nodiscard]] hi_force_inline static std::size_t get_mask(array_type a) noexcept
114 {
115 return _mm256_movemask_pd(L(a));
116 }
117
118 [[nodiscard]] hi_force_inline static array_type neg(array_type a) noexcept
119 {
120 return S(_mm256_sub_pd(_mm256_setzero_pd(), L(a)));
121 }
122
123 template<std::size_t Mask>
124 [[nodiscard]] hi_force_inline constexpr static array_type neg_mask(array_type a) noexcept
125 {
126 if constexpr (Mask == 0) {
127 return a;
128 } else if constexpr (Mask == 0b1111) {
129 return S(_mm256_sub_pd(_mm256_setzero_pd(), L(a)));
130#if defined(HI_HAS_SSE3)
131 } else if constexpr (Mask == 0b0101) {
132 return S(_mm256_addsub_pd(_mm256_setzero_pd(), L(a)));
133#endif
134 } else {
135 auto const tmp = _mm256_sub_pd(_mm256_setzero_pd(), L(a));
136 return blend<Mask>(a, S(tmp));
137 }
138 }
139
140 [[nodiscard]] hi_force_inline static array_type inv(array_type a) noexcept
141 {
142 return _xor(set_all_ones(), a);
143 }
144
145 [[nodiscard]] hi_force_inline static array_type rcp(array_type a) noexcept
146 {
147 return S(_mm256_div_pd(_mm256_set1_pd(1.0), L(a)));
148 }
149
150 [[nodiscard]] hi_force_inline static array_type sqrt(array_type a) noexcept
151 {
152 return S(_mm256_sqrt_pd(L(a)));
153 }
154
155 [[nodiscard]] hi_force_inline static array_type rsqrt(array_type a) noexcept
156 {
157 return S(_mm256_div_pd(_mm256_set1_pd(1.0), _mm256_sqrt_pd(L(a))));
158 }
159
160#if defined(HI_HAS_SSE2)
161 [[nodiscard]] hi_force_inline static array_type round(array_type a) noexcept
162 {
163#if defined(HI_HAS_SSE4_1)
164 return S(_mm256_round_pd(L(a), _MM_FROUND_CUR_DIRECTION));
165#else
166 auto const a_ = L(a);
167 auto const rounded = _mm256_cvtepi32_pd(_mm256_cvtps_epi32(a_));
168 auto const check_max = _mm256_cmple_pd(a_, _mm256_set1_pd(static_cast<float>(std::numeric_limits<int32_t>::max())));
169 auto const check_min = _mm256_cmpge_pd(a_, _mm256_set1_pd(static_cast<float>(std::numeric_limits<int32_t>::min())));
170 auto const check_bounds = _mm256_and_pd(check_max, check_min);
171
172 auto const good_rounded = _mm256_and_pd(check_bounds, rounded);
173 auto const good_a = _mm256_andnot_pd(check_bounds, a_);
174 return S(_mm256_or_pd(good_rounded, good_a));
175#endif
176 }
177#endif
178
179#if defined(HI_HAS_SSE4_1)
180 [[nodiscard]] hi_force_inline static array_type floor(array_type a) noexcept
181 {
182 return S(_mm256_floor_pd(L(a)));
183 }
184
185 [[nodiscard]] hi_force_inline static array_type ceil(array_type a) noexcept
186 {
187 return S(_mm256_ceil_pd(L(a)));
188 }
189#endif
190
191 [[nodiscard]] hi_force_inline static array_type add(array_type a, array_type b) noexcept
192 {
193 return S(_mm256_add_pd(L(a), L(b)));
194 }
195
196 [[nodiscard]] hi_force_inline static array_type sub(array_type a, array_type b) noexcept
197 {
198 return S(_mm256_sub_pd(L(a), L(b)));
199 }
200
201 template<std::size_t Mask>
202 [[nodiscard]] hi_force_inline constexpr static array_type addsub_mask(array_type a, array_type b) noexcept
203 {
204 if constexpr (Mask == 0) {
205 return sub(a, b);
206 } else if constexpr (Mask == 0b1111) {
207 return add(a, b);
208#if defined(HI_HAS_SSE3)
209 } else if constexpr (Mask == 0b1010) {
210 return S(_mm256_addsub_pd(L(a), L(b)));
211#endif
212 } else {
213 return blend<Mask>(sub(a, b), add(a, b));
214 }
215 }
216
217 [[nodiscard]] hi_force_inline static array_type mul(array_type a, array_type b) noexcept
218 {
219 return S(_mm256_mul_pd(L(a), L(b)));
220 }
221
222 [[nodiscard]] hi_force_inline static array_type div(array_type a, array_type b) noexcept
223 {
224 return S(_mm256_div_pd(L(a), L(b)));
225 }
226
227 [[nodiscard]] hi_force_inline static array_type eq(array_type a, array_type b) noexcept
228 {
229 return S(_mm256_cmp_pd(L(a), L(b), _CMP_EQ_OS));
230 }
231
232 [[nodiscard]] hi_force_inline static array_type ne(array_type a, array_type b) noexcept
233 {
234 return S(_mm256_cmp_pd(L(a), L(b), _CMP_NEQ_OS));
235 }
236
237 [[nodiscard]] hi_force_inline static array_type lt(array_type a, array_type b) noexcept
238 {
239 return S(_mm256_cmp_pd(L(a), L(b), _CMP_LT_OS));
240 }
241
242 [[nodiscard]] hi_force_inline static array_type gt(array_type a, array_type b) noexcept
243 {
244 return S(_mm256_cmp_pd(L(a), L(b), _CMP_GT_OS));
245 }
246
247 [[nodiscard]] hi_force_inline static array_type le(array_type a, array_type b) noexcept
248 {
249 return S(_mm256_cmp_pd(L(a), L(b), _CMP_LE_OS));
250 }
251
252 [[nodiscard]] hi_force_inline static array_type ge(array_type a, array_type b) noexcept
253 {
254 return S(_mm256_cmp_pd(L(a), L(b), _CMP_GE_OS));
255 }
256
257 [[nodiscard]] hi_force_inline static array_type max(array_type a, array_type b) noexcept
258 {
259 return S(_mm256_max_pd(L(a), L(b)));
260 }
261
262 [[nodiscard]] hi_force_inline static array_type min(array_type a, array_type b) noexcept
263 {
264 return S(_mm256_min_pd(L(a), L(b)));
265 }
266
267 [[nodiscard]] hi_force_inline static array_type clamp(array_type v, array_type lo, array_type hi) noexcept
268 {
269 return S(_mm256_min_pd(_mm256_max_pd(L(v), L(lo)), L(hi)));
270 }
271
272 [[nodiscard]] hi_force_inline static array_type _or(array_type a, array_type b) noexcept
273 {
274 return S(_mm256_or_pd(L(a), L(b)));
275 }
276
277 [[nodiscard]] hi_force_inline static array_type _and(array_type a, array_type b) noexcept
278 {
279 return S(_mm256_and_pd(L(a), L(b)));
280 }
281
282 [[nodiscard]] hi_force_inline static array_type _xor(array_type a, array_type b) noexcept
283 {
284 return S(_mm256_xor_pd(L(a), L(b)));
285 }
286
287 [[nodiscard]] hi_force_inline static array_type andnot(array_type a, array_type b) noexcept
288 {
289 return S(_mm256_andnot_pd(L(a), L(b)));
290 }
291
292#if defined(HI_HAS_SSE2)
293 [[nodiscard]] hi_force_inline static array_type sll(array_type a, unsigned int b) noexcept
294 {
295 auto const b_ = _mm_set_epi32(0, 0, 0, b);
296 return S(_mm256_castsi256_pd(_mm256_sll_epi32(_mm256_castpd_si256(L(a)), b_)));
297 }
298#endif
299
300#if defined(HI_HAS_SSE2)
301 [[nodiscard]] hi_force_inline static array_type srl(array_type a, unsigned int b) noexcept
302 {
303 auto const b_ = _mm_set_epi32(0, 0, 0, b);
304 return S(_mm256_castsi256_pd(_mm256_srl_epi32(_mm256_castpd_si256(L(a)), b_)));
305 }
306#endif
307
308#if defined(HI_HAS_SSE2)
309 [[nodiscard]] hi_force_inline static array_type sra(array_type a, unsigned int b) noexcept
310 {
311 auto const b_ = _mm_set_epi32(0, 0, 0, b);
312 return S(_mm256_castsi256_pd(_mm256_sra_epi32(_mm256_castpd_si256(L(a)), b_)));
313 }
314#endif
315
316 template<int... Indices>
317 [[nodiscard]] constexpr static unsigned int _make_indices_imm() noexcept
318 {
319 static_assert(sizeof...(Indices) == 4);
320
321 constexpr auto indices = std::array{Indices...};
322 auto r = 0U;
323 for (size_t i = 0; i != 4; ++i) {
324 auto const index = indices[i] < 0 ? i : indices[i];
325 r |= index << (i * 2);
326 }
327 return r;
328 }
329
330 template<int... Indices>
331 [[nodiscard]] hi_force_inline static array_type shuffle(array_type a) noexcept
332 {
333 return S(_mm256_shuffle_pd(L(a), L(a), _make_indices_imm<Indices...>()));
334 }
335
336 template<size_t Mask>
337 [[nodiscard]] hi_force_inline static array_type blend(array_type a, array_type b) noexcept
338 {
339#if defined(HI_HAS_SSE4_1)
340 return S(_mm256_blend_pd(L(a), L(b), Mask));
341#else
342 auto const lo = _mm256_unpacklo_pd(L(a), L(b));
343 auto const hi = _mm256_unpackhi_pd(L(a), L(b));
344 // clang-format off
345 constexpr auto indices =
346 (Mask & 0b0001 ? 0b00'00'00'01U : 0b00'00'00'00U) |
347 (Mask & 0b0010 ? 0b00'00'11'00U : 0b00'00'10'00U) |
348 (Mask & 0b0100 ? 0b00'01'00'00U : 0b00'00'00'00U) |
349 (Mask & 0b1000 ? 0b11'00'00'00U : 0b10'00'00'00U);
350 // clang-format on
351 return S(_mm256_shuffle_pd(lo, hi, indices));
352#endif
353 }
354
355 template<size_t Mask>
356 [[nodiscard]] hi_force_inline static array_type dot(array_type a, array_type b) noexcept
357 {
358#if defined(HI_HAS_SSE4_1)
359 return S(_mm256_dp_pd(L(a), L(b), (Mask << 4) | 0b1111));
360#else
361 auto const multiplied = blend<Mask>(set_zero(), mul(a, b));
362 return sum(multiplied);
363#endif
364 }
365};
366#endif
367
368} // namespace v1
369} // namespace hi
@ round
The end cap of the line is round.
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
T ceil(T... args)
T div(T... args)
T floor(T... args)
T shuffle(T... args)