7#include "array_intrinsic.hpp"
9#include "half_to_float.hpp"
23hi_export_module(hikocpu : array_intrinsic_f32x4);
25hi_export
namespace hi {
28#if defined(HI_HAS_SSE)
30struct array_intrinsic<float, 4> {
31 using value_type = float;
32 using register_type = __m128;
37 [[nodiscard]] hi_force_inline
static register_type L(array_type a)
noexcept
39 return _mm_loadu_ps(a.data());
44 [[nodiscard]] hi_force_inline
static array_type S(register_type a)
noexcept
46 auto r = array_type{};
47 _mm_storeu_ps(r.data(), a);
56 [[nodiscard]] hi_force_inline
static array_type undefined() noexcept
58 return S(_mm_undefined_ps());
61 [[nodiscard]] hi_force_inline
static array_type set(
float a,
float b,
float c,
float d)
noexcept
63 return S(_mm_set_ps(d, c, b, a));
66 [[nodiscard]] hi_force_inline
static array_type set(
float a)
noexcept
68 return S(_mm_set_ps(0.0f, 0.0f, 0.0f, a));
71 [[nodiscard]] hi_force_inline
static array_type set_zero() noexcept
73 return S(_mm_setzero_ps());
76 [[nodiscard]] hi_force_inline
static array_type set_all_ones() noexcept
78#if defined(HI_HAS_SSE2)
79 return S(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())));
81 return S(_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()));
85 [[nodiscard]] hi_force_inline
static array_type set_one() noexcept
87#if defined(HI_HAS_SSE2)
88 auto const ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
89 return S(_mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(ones, 25), 2)));
91 return S(_mm_set1_ps(1.0f));
96 [[nodiscard]] hi_force_inline
static float get(array_type a)
noexcept
98 if constexpr (I == 0) {
99 return _mm_cvtss_f32(L(a));
101 return _mm_cvtss_f32(_mm_shuffle_ps(L(a), L(a), I));
105 [[nodiscard]] hi_force_inline
static array_type broadcast(
float a)
noexcept
107 return S(_mm_set1_ps(a));
110 [[nodiscard]] hi_force_inline
static array_type broadcast(array_type a)
noexcept
112 return S(_mm_shuffle_ps(L(a), L(a), 0));
#if defined(HI_HAS_SSE2)
/** Expand a bit mask into a per-element mask.
 *
 * Bit i of @a mask (i in 0..3) selects element i: the element becomes
 * all-ones when the bit is set and all-zeros when it is clear.
 *
 * @param mask Bit mask; only the low four bits are used.
 */
[[nodiscard]] hi_force_inline static array_type set_mask(std::size_t mask) noexcept
{
    // Move bit i of the mask into the sign bit of element i.
    // _mm_set_epi32 takes the highest element first.
    auto const tmp = _mm_set_epi32(
        static_cast<int32_t>(mask) << 28,
        static_cast<int32_t>(mask) << 29,
        static_cast<int32_t>(mask) << 30,
        static_cast<int32_t>(mask) << 31);

    // The arithmetic right shift replicates the sign bit over the whole element.
    return S(_mm_castsi128_ps(_mm_srai_epi32(tmp, 31)));
}
#endif
131 [[nodiscard]] hi_force_inline
static std::size_t get_mask(array_type a)
noexcept
133 return _mm_movemask_ps(L(a));
137 [[nodiscard]] hi_force_inline
static array_type neg(array_type a)
noexcept
139 return S(_mm_sub_ps(_mm_setzero_ps(), L(a)));
142 template<std::
size_t Mask>
143 [[nodiscard]] hi_force_inline
constexpr static array_type neg_mask(array_type a)
noexcept
145 if constexpr (Mask == 0) {
147 }
else if constexpr (Mask == 0b1111) {
148 return S(_mm_sub_ps(_mm_setzero_ps(), L(a)));
149#if defined(HI_HAS_SSE3)
150 }
else if constexpr (Mask == 0b0101) {
151 return S(_mm_addsub_ps(_mm_setzero_ps(), L(a)));
154 auto const tmp = _mm_sub_ps(_mm_setzero_ps(), L(a));
155 return blend<Mask>(a, S(tmp));
159 [[nodiscard]] hi_force_inline
static array_type inv(array_type a)
noexcept
161 return _xor(set_all_ones(), a);
164 [[nodiscard]] hi_force_inline
static array_type rcp(array_type a)
noexcept
166 return S(_mm_rcp_ps(L(a)));
169 [[nodiscard]] hi_force_inline
static array_type sqrt(array_type a)
noexcept
171 return S(_mm_sqrt_ps(L(a)));
174 [[nodiscard]] hi_force_inline
static array_type rsqrt(array_type a)
noexcept
176 return S(_mm_rsqrt_ps(L(a)));
// Round every element of `a` to an integral value. Only available with SSE2.
179#if defined(HI_HAS_SSE2)
180 [[nodiscard]] hi_force_inline
static array_type
round(array_type a)
noexcept
// With SSE4.1 the dedicated rounding instruction is used, rounding in the
// current rounding direction.
182#if defined(HI_HAS_SSE4_1)
183 return S(_mm_round_ps(L(a), _MM_FROUND_CUR_DIRECTION));
// Fallback: round by converting to 32-bit integers and back to float.
185 auto const a_ = L(a);
186 auto const rounded = _mm_cvtepi32_ps(_mm_cvtps_epi32(a_));
// NOTE(review): the definitions of check_max and check_min are not visible
// here; presumably they are per-element range checks of a_ against the
// int32 limits -- confirm against the full source.
189 auto const check_bounds = _mm_and_ps(check_max, check_min);
// Take the rounded value where check_bounds is set and the original value
// elsewhere.
191 auto const good_rounded = _mm_and_ps(check_bounds, rounded);
192 auto const good_a = _mm_andnot_ps(check_bounds, a_);
193 return S(_mm_or_ps(good_rounded, good_a));
#if defined(HI_HAS_SSE4_1)
/** Round every element down to an integral value (`_mm_floor_ps`).
 *
 * @param a The array to round.
 */
[[nodiscard]] hi_force_inline static array_type floor(array_type a) noexcept
{
    return S(_mm_floor_ps(L(a)));
}

/** Round every element up to an integral value (`_mm_ceil_ps`).
 *
 * @param a The array to round.
 */
[[nodiscard]] hi_force_inline static array_type ceil(array_type a) noexcept
{
    return S(_mm_ceil_ps(L(a)));
}
#endif
210 [[nodiscard]] hi_force_inline
static array_type add(array_type a, array_type b)
noexcept
212 return S(_mm_add_ps(L(a), L(b)));
215 [[nodiscard]] hi_force_inline
static array_type sub(array_type a, array_type b)
noexcept
217 return S(_mm_sub_ps(L(a), L(b)));
220 template<std::
size_t Mask>
221 [[nodiscard]] hi_force_inline
constexpr static array_type addsub_mask(array_type a, array_type b)
noexcept
223 if constexpr (Mask == 0) {
225 }
else if constexpr (Mask == 0b1111) {
227#if defined(HI_HAS_SSE3)
228 }
else if constexpr (Mask == 0b1010) {
229 return S(_mm_addsub_ps(L(a), L(b)));
232 return blend<Mask>(sub(a, b), add(a, b));
236 [[nodiscard]] hi_force_inline
static array_type mul(array_type a, array_type b)
noexcept
238 return S(_mm_mul_ps(L(a), L(b)));
241 [[nodiscard]] hi_force_inline
static array_type
div(array_type a, array_type b)
noexcept
243 return S(_mm_div_ps(L(a), L(b)));
246 [[nodiscard]] hi_force_inline
static array_type eq(array_type a, array_type b)
noexcept
248 return S(_mm_cmpeq_ps(L(a), L(b)));
251 [[nodiscard]] hi_force_inline
static array_type ne(array_type a, array_type b)
noexcept
253 return S(_mm_cmpneq_ps(L(a), L(b)));
256 [[nodiscard]] hi_force_inline
static array_type lt(array_type a, array_type b)
noexcept
258 return S(_mm_cmplt_ps(L(a), L(b)));
261 [[nodiscard]] hi_force_inline
static array_type gt(array_type a, array_type b)
noexcept
263 return S(_mm_cmpgt_ps(L(a), L(b)));
266 [[nodiscard]] hi_force_inline
static array_type le(array_type a, array_type b)
noexcept
268 return S(_mm_cmple_ps(L(a), L(b)));
271 [[nodiscard]] hi_force_inline
static array_type ge(array_type a, array_type b)
noexcept
273 return S(_mm_cmpge_ps(L(a), L(b)));
276 [[nodiscard]] hi_force_inline
static bool test(array_type a, array_type b)
noexcept
278#if defined(HI_HAS_SSE4_1)
279 return static_cast<bool>(_mm_testz_si128(_mm_castps_si128(L(a)), _mm_castps_si128(L(b))));
280#elif defined(HI_HAS_SSE2)
281 return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_castps_si128(_mm_and_ps(L(a), L(b))), _mm_setzero_si128())) == 0xffff;
284 _mm_store_ps(tmp.data(), _mm_and_ps(L(a), L(b)));
286 return (std::bit_cast<uint32_t>(std::get<0>(tmp)) | std::bit_cast<uint32_t>(std::get<1>(tmp)) |
287 std::bit_cast<uint32_t>(std::get<2>(tmp)) | std::bit_cast<uint32_t>(std::get<3>(tmp))) == 0;
291 [[nodiscard]] hi_force_inline
static array_type max(array_type a, array_type b)
noexcept
293 return S(_mm_max_ps(L(a), L(b)));
296 [[nodiscard]] hi_force_inline
static array_type min(array_type a, array_type b)
noexcept
298 return S(_mm_min_ps(L(a), L(b)));
301 [[nodiscard]] hi_force_inline
static array_type clamp(array_type v, array_type lo, array_type
hi)
noexcept
303 return S(_mm_min_ps(_mm_max_ps(L(v), L(lo)), L(
hi)));
306 [[nodiscard]] hi_force_inline
static array_type _or(array_type a, array_type b)
noexcept
308 return S(_mm_or_ps(L(a), L(b)));
311 [[nodiscard]] hi_force_inline
static array_type _and(array_type a, array_type b)
noexcept
313 return S(_mm_and_ps(L(a), L(b)));
316 [[nodiscard]] hi_force_inline
static array_type _xor(array_type a, array_type b)
noexcept
318 return S(_mm_xor_ps(L(a), L(b)));
321 [[nodiscard]] hi_force_inline
static array_type andnot(array_type a, array_type b)
noexcept
323 return S(_mm_andnot_ps(L(a), L(b)));
#if defined(HI_HAS_SSE2)
/** Shift the bits of every 32-bit element left by @a b, shifting in zeros.
 *
 * @param a The array whose element bit patterns are shifted.
 * @param b The shift count, applied to all elements.
 */
[[nodiscard]] hi_force_inline static array_type sll(array_type a, unsigned int b) noexcept
{
    // The shift count is passed in the low element of an integer register.
    auto const b_ = _mm_set_epi32(0, 0, 0, b);
    return S(_mm_castsi128_ps(_mm_sll_epi32(_mm_castps_si128(L(a)), b_)));
}
#endif
#if defined(HI_HAS_SSE2)
/** Shift the bits of every 32-bit element right by @a b, shifting in zeros.
 *
 * @param a The array whose element bit patterns are shifted.
 * @param b The shift count, applied to all elements.
 */
[[nodiscard]] hi_force_inline static array_type srl(array_type a, unsigned int b) noexcept
{
    // The shift count is passed in the low element of an integer register.
    auto const b_ = _mm_set_epi32(0, 0, 0, b);
    return S(_mm_castsi128_ps(_mm_srl_epi32(_mm_castps_si128(L(a)), b_)));
}
#endif
#if defined(HI_HAS_SSE2)
/** Shift the bits of every 32-bit element right by @a b, shifting in copies of the sign bit.
 *
 * @param a The array whose element bit patterns are shifted.
 * @param b The shift count, applied to all elements.
 */
[[nodiscard]] hi_force_inline static array_type sra(array_type a, unsigned int b) noexcept
{
    // The shift count is passed in the low element of an integer register.
    auto const b_ = _mm_set_epi32(0, 0, 0, b);
    return S(_mm_castsi128_ps(_mm_sra_epi32(_mm_castps_si128(L(a)), b_)));
}
#endif
350 [[nodiscard]] hi_force_inline
static array_type hadd(array_type a, array_type b)
noexcept
352#if defined(HI_HAS_SSE3)
353 return S(_mm_hadd_ps(L(a), L(b)));
355 auto const a_ = L(a);
356 auto const b_ = L(b);
357 auto const tmp1 = _mm_shuffle_ps(a_, b_, 0b10'00'10'00);
358 auto const tmp2 = _mm_shuffle_ps(a_, b_, 0b11'01'11'01);
359 return S(_mm_add_ps(tmp1, tmp2));
363 [[nodiscard]] hi_force_inline
static array_type hsub(array_type a, array_type b)
noexcept
365#if defined(HI_HAS_SSE3)
366 return S(_mm_hsub_ps(L(a), L(b)));
368 auto const a_ = L(a);
369 auto const b_ = L(b);
370 auto const tmp1 = _mm_shuffle_ps(a_, b_, 0b10'00'10'00);
371 auto const tmp2 = _mm_shuffle_ps(a_, b_, 0b11'01'11'01);
372 return S(_mm_sub_ps(tmp1, tmp2));
/** Encode four element indices as a shuffle immediate.
 *
 * Destination element i gets its source index stored in bits
 * `[2*i + 1 : 2*i]` of the result. A negative index means "keep element i
 * in place", i.e. it is replaced by i.
 *
 * @tparam Indices Exactly four source indices, one per destination element.
 * @return The 8-bit immediate, as an unsigned int.
 */
template<int... Indices>
[[nodiscard]] constexpr static unsigned int _make_indices_imm() noexcept
{
    static_assert(sizeof...(Indices) == 4);

    constexpr auto indices = std::array{Indices...};

    auto r = 0U;
    for (std::size_t i = 0; i != 4; ++i) {
        // Do the arithmetic in unsigned int to avoid mixing signed and unsigned operands.
        auto const index = indices[i] < 0 ? static_cast<unsigned int>(i) : static_cast<unsigned int>(indices[i]);
        r |= index << (i * 2);
    }
    return r;
}
390 template<
int... Indices>
391 [[nodiscard]] hi_force_inline
static array_type
shuffle(array_type a)
noexcept
393 return S(_mm_shuffle_ps(L(a), L(a), _make_indices_imm<Indices...>()));
396 template<
size_t Mask>
397 [[nodiscard]] hi_force_inline
static array_type blend(array_type a, array_type b)
noexcept
399#if defined(HI_HAS_SSE4_1)
400 return S(_mm_blend_ps(L(a), L(b), Mask));
402 auto const lo = _mm_unpacklo_ps(L(a), L(b));
403 auto const hi = _mm_unpackhi_ps(L(a), L(b));
405 constexpr auto indices =
406 (Mask & 0b0001 ? 0b00'00'00'01U : 0b00'00'00'00U) |
407 (Mask & 0b0010 ? 0b00'00'11'00U : 0b00'00'10'00U) |
408 (Mask & 0b0100 ? 0b00'01'00'00U : 0b00'00'00'00U) |
409 (Mask & 0b1000 ? 0b11'00'00'00U : 0b10'00'00'00U);
411 return S(_mm_shuffle_ps(lo,
hi, indices));
415 [[nodiscard]] hi_force_inline
static std::array<array_type, 4> transpose(array_type a, array_type b, array_type c, array_type d)
421 _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
422 return {S(a_), S(b_), S(c_), S(d_)};
425 [[nodiscard]] hi_force_inline
static array_type sum(array_type a)
noexcept
427 auto const x_y_z_w = L(a);
428 auto const y_x_w_z = _mm_shuffle_ps(x_y_z_w, x_y_z_w, 0b10'11'00'01);
429 auto const xy_yx_zw_wz = _mm_add_ps(x_y_z_w, y_x_w_z);
430 auto const zw_wz_w_z = _mm_movehl_ps(y_x_w_z, xy_yx_zw_wz);
431 auto const xyzw_0_0_0 = _mm_add_ss(xy_yx_zw_wz, zw_wz_w_z);
432 return S(_mm_shuffle_ps(xyzw_0_0_0, xyzw_0_0_0, 0));
435 template<
size_t Mask>
436 [[nodiscard]] hi_force_inline
static array_type dot(array_type a, array_type b)
noexcept
438#if defined(HI_HAS_SSE4_1)
439 return S(_mm_dp_ps(L(a), L(b), (Mask << 4) | 0b1111));
441 auto const multiplied = blend<Mask>(set_zero(), mul(a, b));
442 return sum(multiplied);
@ round
The end cap of the line is round.
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20