HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
array_intrinsic_f32x4_x86.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
#include "array_intrinsic.hpp"
#include "half.hpp"
#include "half_to_float.hpp"
#include "macros.hpp"
#include <cstddef>
#include <cstdint>
#include <array>
#include <bit>
#include <limits>

#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <nmmintrin.h>
#include <immintrin.h>
22
23hi_export_module(hikocpu : array_intrinsic_f32x4);
24
25hi_export namespace hi {
26inline namespace v1 {
27
28#if defined(HI_HAS_SSE)
29template<>
30struct array_intrinsic<float, 4> {
31 using value_type = float;
32 using register_type = __m128;
33 using array_type = std::array<float, 4>;
34
37 [[nodiscard]] hi_force_inline static register_type L(array_type a) noexcept
38 {
39 return _mm_loadu_ps(a.data());
40 }
41
44 [[nodiscard]] hi_force_inline static array_type S(register_type a) noexcept
45 {
46 auto r = array_type{};
47 _mm_storeu_ps(r.data(), a);
48 return r;
49 }
50
51 [[nodiscard]] hi_force_inline static array_type convert(std::array<half, 4> a) noexcept
52 {
53 return half_to_float(std::bit_cast<std::array<uint16_t, 4>>(a));
54 }
55
56 [[nodiscard]] hi_force_inline static array_type undefined() noexcept
57 {
58 return S(_mm_undefined_ps());
59 }
60
61 [[nodiscard]] hi_force_inline static array_type set(float a, float b, float c, float d) noexcept
62 {
63 return S(_mm_set_ps(d, c, b, a));
64 }
65
66 [[nodiscard]] hi_force_inline static array_type set(float a) noexcept
67 {
68 return S(_mm_set_ps(0.0f, 0.0f, 0.0f, a));
69 }
70
71 [[nodiscard]] hi_force_inline static array_type set_zero() noexcept
72 {
73 return S(_mm_setzero_ps());
74 }
75
76 [[nodiscard]] hi_force_inline static array_type set_all_ones() noexcept
77 {
78#if defined(HI_HAS_SSE2)
79 return S(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())));
80#else
81 return S(_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()));
82#endif
83 }
84
85 [[nodiscard]] hi_force_inline static array_type set_one() noexcept
86 {
87#if defined(HI_HAS_SSE2)
88 auto const ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
89 return S(_mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(ones, 25), 2)));
90#else
91 return S(_mm_set1_ps(1.0f));
92#endif
93 }
94
95 template<size_t I>
96 [[nodiscard]] hi_force_inline static float get(array_type a) noexcept
97 {
98 if constexpr (I == 0) {
99 return _mm_cvtss_f32(L(a));
100 } else {
101 return _mm_cvtss_f32(_mm_shuffle_ps(L(a), L(a), I));
102 }
103 }
104
105 [[nodiscard]] hi_force_inline static array_type broadcast(float a) noexcept
106 {
107 return S(_mm_set1_ps(a));
108 }
109
110 [[nodiscard]] hi_force_inline static array_type broadcast(array_type a) noexcept
111 {
112 return S(_mm_shuffle_ps(L(a), L(a), 0));
113 }
114
115#if defined(HI_HAS_SSE2)
116 [[nodiscard]] hi_force_inline static array_type set_mask(std::size_t mask) noexcept
117 {
118 // clang-format off
119 auto const tmp = _mm_set_epi32(
120 static_cast<int32_t>(mask) << 28,
121 static_cast<int32_t>(mask) << 29,
122 static_cast<int32_t>(mask) << 30,
123 static_cast<int32_t>(mask) << 31);
124 // clang-format on
125 return S(_mm_castsi128_ps(_mm_srai_epi32(tmp, 31)));
126 }
127#endif
128
131 [[nodiscard]] hi_force_inline static std::size_t get_mask(array_type a) noexcept
132 {
133 return _mm_movemask_ps(L(a));
134 }
135
136
137 [[nodiscard]] hi_force_inline static array_type neg(array_type a) noexcept
138 {
139 return S(_mm_sub_ps(_mm_setzero_ps(), L(a)));
140 }
141
142 template<std::size_t Mask>
143 [[nodiscard]] hi_force_inline constexpr static array_type neg_mask(array_type a) noexcept
144 {
145 if constexpr (Mask == 0) {
146 return a;
147 } else if constexpr (Mask == 0b1111) {
148 return S(_mm_sub_ps(_mm_setzero_ps(), L(a)));
149#if defined(HI_HAS_SSE3)
150 } else if constexpr (Mask == 0b0101) {
151 return S(_mm_addsub_ps(_mm_setzero_ps(), L(a)));
152#endif
153 } else {
154 auto const tmp = _mm_sub_ps(_mm_setzero_ps(), L(a));
155 return blend<Mask>(a, S(tmp));
156 }
157 }
158
159 [[nodiscard]] hi_force_inline static array_type inv(array_type a) noexcept
160 {
161 return _xor(set_all_ones(), a);
162 }
163
164 [[nodiscard]] hi_force_inline static array_type rcp(array_type a) noexcept
165 {
166 return S(_mm_rcp_ps(L(a)));
167 }
168
169 [[nodiscard]] hi_force_inline static array_type sqrt(array_type a) noexcept
170 {
171 return S(_mm_sqrt_ps(L(a)));
172 }
173
174 [[nodiscard]] hi_force_inline static array_type rsqrt(array_type a) noexcept
175 {
176 return S(_mm_rsqrt_ps(L(a)));
177 }
178
179#if defined(HI_HAS_SSE2)
180 [[nodiscard]] hi_force_inline static array_type round(array_type a) noexcept
181 {
182#if defined(HI_HAS_SSE4_1)
183 return S(_mm_round_ps(L(a), _MM_FROUND_CUR_DIRECTION));
184#else
185 auto const a_ = L(a);
186 auto const rounded = _mm_cvtepi32_ps(_mm_cvtps_epi32(a_));
187 auto const check_max = _mm_cmple_ps(a_, _mm_set1_ps(static_cast<float>(std::numeric_limits<int32_t>::max())));
188 auto const check_min = _mm_cmpge_ps(a_, _mm_set1_ps(static_cast<float>(std::numeric_limits<int32_t>::min())));
189 auto const check_bounds = _mm_and_ps(check_max, check_min);
190
191 auto const good_rounded = _mm_and_ps(check_bounds, rounded);
192 auto const good_a = _mm_andnot_ps(check_bounds, a_);
193 return S(_mm_or_ps(good_rounded, good_a));
194#endif
195 }
196#endif
197
198#if defined(HI_HAS_SSE4_1)
199 [[nodiscard]] hi_force_inline static array_type floor(array_type a) noexcept
200 {
201 return S(_mm_floor_ps(L(a)));
202 }
203
204 [[nodiscard]] hi_force_inline static array_type ceil(array_type a) noexcept
205 {
206 return S(_mm_ceil_ps(L(a)));
207 }
208#endif
209
210 [[nodiscard]] hi_force_inline static array_type add(array_type a, array_type b) noexcept
211 {
212 return S(_mm_add_ps(L(a), L(b)));
213 }
214
215 [[nodiscard]] hi_force_inline static array_type sub(array_type a, array_type b) noexcept
216 {
217 return S(_mm_sub_ps(L(a), L(b)));
218 }
219
220 template<std::size_t Mask>
221 [[nodiscard]] hi_force_inline constexpr static array_type addsub_mask(array_type a, array_type b) noexcept
222 {
223 if constexpr (Mask == 0) {
224 return sub(a, b);
225 } else if constexpr (Mask == 0b1111) {
226 return add(a, b);
227#if defined(HI_HAS_SSE3)
228 } else if constexpr (Mask == 0b1010) {
229 return S(_mm_addsub_ps(L(a), L(b)));
230#endif
231 } else {
232 return blend<Mask>(sub(a, b), add(a, b));
233 }
234 }
235
236 [[nodiscard]] hi_force_inline static array_type mul(array_type a, array_type b) noexcept
237 {
238 return S(_mm_mul_ps(L(a), L(b)));
239 }
240
241 [[nodiscard]] hi_force_inline static array_type div(array_type a, array_type b) noexcept
242 {
243 return S(_mm_div_ps(L(a), L(b)));
244 }
245
246 [[nodiscard]] hi_force_inline static array_type eq(array_type a, array_type b) noexcept
247 {
248 return S(_mm_cmpeq_ps(L(a), L(b)));
249 }
250
251 [[nodiscard]] hi_force_inline static array_type ne(array_type a, array_type b) noexcept
252 {
253 return S(_mm_cmpneq_ps(L(a), L(b)));
254 }
255
256 [[nodiscard]] hi_force_inline static array_type lt(array_type a, array_type b) noexcept
257 {
258 return S(_mm_cmplt_ps(L(a), L(b)));
259 }
260
261 [[nodiscard]] hi_force_inline static array_type gt(array_type a, array_type b) noexcept
262 {
263 return S(_mm_cmpgt_ps(L(a), L(b)));
264 }
265
266 [[nodiscard]] hi_force_inline static array_type le(array_type a, array_type b) noexcept
267 {
268 return S(_mm_cmple_ps(L(a), L(b)));
269 }
270
271 [[nodiscard]] hi_force_inline static array_type ge(array_type a, array_type b) noexcept
272 {
273 return S(_mm_cmpge_ps(L(a), L(b)));
274 }
275
276 [[nodiscard]] hi_force_inline static bool test(array_type a, array_type b) noexcept
277 {
278#if defined(HI_HAS_SSE4_1)
279 return static_cast<bool>(_mm_testz_si128(_mm_castps_si128(L(a)), _mm_castps_si128(L(b))));
280#elif defined(HI_HAS_SSE2)
281 return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_castps_si128(_mm_and_ps(L(a), L(b))), _mm_setzero_si128())) == 0xffff;
282#else
283 auto tmp = std::array<float, 4>{};
284 _mm_store_ps(tmp.data(), _mm_and_ps(L(a), L(b)));
285
286 return (std::bit_cast<uint32_t>(std::get<0>(tmp)) | std::bit_cast<uint32_t>(std::get<1>(tmp)) |
287 std::bit_cast<uint32_t>(std::get<2>(tmp)) | std::bit_cast<uint32_t>(std::get<3>(tmp))) == 0;
288#endif
289 }
290
291 [[nodiscard]] hi_force_inline static array_type max(array_type a, array_type b) noexcept
292 {
293 return S(_mm_max_ps(L(a), L(b)));
294 }
295
296 [[nodiscard]] hi_force_inline static array_type min(array_type a, array_type b) noexcept
297 {
298 return S(_mm_min_ps(L(a), L(b)));
299 }
300
301 [[nodiscard]] hi_force_inline static array_type clamp(array_type v, array_type lo, array_type hi) noexcept
302 {
303 return S(_mm_min_ps(_mm_max_ps(L(v), L(lo)), L(hi)));
304 }
305
306 [[nodiscard]] hi_force_inline static array_type _or(array_type a, array_type b) noexcept
307 {
308 return S(_mm_or_ps(L(a), L(b)));
309 }
310
311 [[nodiscard]] hi_force_inline static array_type _and(array_type a, array_type b) noexcept
312 {
313 return S(_mm_and_ps(L(a), L(b)));
314 }
315
316 [[nodiscard]] hi_force_inline static array_type _xor(array_type a, array_type b) noexcept
317 {
318 return S(_mm_xor_ps(L(a), L(b)));
319 }
320
321 [[nodiscard]] hi_force_inline static array_type andnot(array_type a, array_type b) noexcept
322 {
323 return S(_mm_andnot_ps(L(a), L(b)));
324 }
325
326#if defined(HI_HAS_SSE2)
327 [[nodiscard]] hi_force_inline static array_type sll(array_type a, unsigned int b) noexcept
328 {
329 auto const b_ = _mm_set_epi32(0, 0, 0, b);
330 return S(_mm_castsi128_ps(_mm_sll_epi32(_mm_castps_si128(L(a)), b_)));
331 }
332#endif
333
334#if defined(HI_HAS_SSE2)
335 [[nodiscard]] hi_force_inline static array_type srl(array_type a, unsigned int b) noexcept
336 {
337 auto const b_ = _mm_set_epi32(0, 0, 0, b);
338 return S(_mm_castsi128_ps(_mm_srl_epi32(_mm_castps_si128(L(a)), b_)));
339 }
340#endif
341
342#if defined(HI_HAS_SSE2)
343 [[nodiscard]] hi_force_inline static array_type sra(array_type a, unsigned int b) noexcept
344 {
345 auto const b_ = _mm_set_epi32(0, 0, 0, b);
346 return S(_mm_castsi128_ps(_mm_sra_epi32(_mm_castps_si128(L(a)), b_)));
347 }
348#endif
349
350 [[nodiscard]] hi_force_inline static array_type hadd(array_type a, array_type b) noexcept
351 {
352#if defined(HI_HAS_SSE3)
353 return S(_mm_hadd_ps(L(a), L(b)));
354#else
355 auto const a_ = L(a);
356 auto const b_ = L(b);
357 auto const tmp1 = _mm_shuffle_ps(a_, b_, 0b10'00'10'00);
358 auto const tmp2 = _mm_shuffle_ps(a_, b_, 0b11'01'11'01);
359 return S(_mm_add_ps(tmp1, tmp2));
360#endif
361 }
362
363 [[nodiscard]] hi_force_inline static array_type hsub(array_type a, array_type b) noexcept
364 {
365#if defined(HI_HAS_SSE3)
366 return S(_mm_hsub_ps(L(a), L(b)));
367#else
368 auto const a_ = L(a);
369 auto const b_ = L(b);
370 auto const tmp1 = _mm_shuffle_ps(a_, b_, 0b10'00'10'00);
371 auto const tmp2 = _mm_shuffle_ps(a_, b_, 0b11'01'11'01);
372 return S(_mm_sub_ps(tmp1, tmp2));
373#endif
374 }
375
376 template<int... Indices>
377 [[nodiscard]] constexpr static unsigned int _make_indices_imm() noexcept
378 {
379 static_assert(sizeof...(Indices) == 4);
380
381 constexpr auto indices = std::array{Indices...};
382 auto r = 0U;
383 for (size_t i = 0; i != 4; ++i) {
384 auto const index = indices[i] < 0 ? i : indices[i];
385 r |= index << (i * 2);
386 }
387 return r;
388 }
389
390 template<int... Indices>
391 [[nodiscard]] hi_force_inline static array_type shuffle(array_type a) noexcept
392 {
393 return S(_mm_shuffle_ps(L(a), L(a), _make_indices_imm<Indices...>()));
394 }
395
396 template<size_t Mask>
397 [[nodiscard]] hi_force_inline static array_type blend(array_type a, array_type b) noexcept
398 {
399#if defined(HI_HAS_SSE4_1)
400 return S(_mm_blend_ps(L(a), L(b), Mask));
401#else
402 auto const lo = _mm_unpacklo_ps(L(a), L(b));
403 auto const hi = _mm_unpackhi_ps(L(a), L(b));
404 // clang-format off
405 constexpr auto indices =
406 (Mask & 0b0001 ? 0b00'00'00'01U : 0b00'00'00'00U) |
407 (Mask & 0b0010 ? 0b00'00'11'00U : 0b00'00'10'00U) |
408 (Mask & 0b0100 ? 0b00'01'00'00U : 0b00'00'00'00U) |
409 (Mask & 0b1000 ? 0b11'00'00'00U : 0b10'00'00'00U);
410 // clang-format on
411 return S(_mm_shuffle_ps(lo, hi, indices));
412#endif
413 }
414
415 [[nodiscard]] hi_force_inline static std::array<array_type, 4> transpose(array_type a, array_type b, array_type c, array_type d)
416 {
417 auto a_ = L(a);
418 auto b_ = L(b);
419 auto c_ = L(c);
420 auto d_ = L(d);
421 _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
422 return {S(a_), S(b_), S(c_), S(d_)};
423 }
424
425 [[nodiscard]] hi_force_inline static array_type sum(array_type a) noexcept
426 {
427 auto const x_y_z_w = L(a);
428 auto const y_x_w_z = _mm_shuffle_ps(x_y_z_w, x_y_z_w, 0b10'11'00'01);
429 auto const xy_yx_zw_wz = _mm_add_ps(x_y_z_w, y_x_w_z);
430 auto const zw_wz_w_z = _mm_movehl_ps(y_x_w_z, xy_yx_zw_wz);
431 auto const xyzw_0_0_0 = _mm_add_ss(xy_yx_zw_wz, zw_wz_w_z);
432 return S(_mm_shuffle_ps(xyzw_0_0_0, xyzw_0_0_0, 0));
433 }
434
435 template<size_t Mask>
436 [[nodiscard]] hi_force_inline static array_type dot(array_type a, array_type b) noexcept
437 {
438#if defined(HI_HAS_SSE4_1)
439 return S(_mm_dp_ps(L(a), L(b), (Mask << 4) | 0b1111));
440#else
441 auto const multiplied = blend<Mask>(set_zero(), mul(a, b));
442 return sum(multiplied);
443#endif
444 }
445};
446#endif
447
448} // namespace v1
} // namespace hi
@ round
The end cap of the line is round.
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
T ceil(T... args)
T div(T... args)
T floor(T... args)
T shuffle(T... args)