7#include "array_intrinsic.hpp"
21hi_export_module(hikocpu : array_intrinsic_f32x4);
23hi_export
namespace hi {
26#if defined(HI_HAS_SSE2)
28struct array_intrinsic<double, 2> {
29 using value_type = double;
30 using register_type = __m128d;
35 [[nodiscard]] hi_force_inline
static register_type L(array_type a)
noexcept
37 return _mm_loadu_pd(a.data());
42 [[nodiscard]] hi_force_inline
static array_type S(register_type a)
noexcept
44 auto r = array_type{};
45 _mm_storeu_pd(r.data(), a);
49 [[nodiscard]] hi_force_inline
static array_type undefined() noexcept
51 return S(_mm_undefined_pd());
54 [[nodiscard]] hi_force_inline
static array_type set(
float a,
float b)
noexcept
56 return S(_mm_set_pd(b, a));
59 [[nodiscard]] hi_force_inline
static array_type set(
float a)
noexcept
61 return S(_mm_set_pd(0.0, a));
64 [[nodiscard]] hi_force_inline
static array_type set_zero() noexcept
66 return S(_mm_setzero_pd());
69 [[nodiscard]] hi_force_inline
static array_type set_all_ones() noexcept
71 return S(_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())));
74 [[nodiscard]] hi_force_inline
static array_type set_one() noexcept
76 auto const ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
77 return S(_mm_castsi128_pd(_mm_srli_epi64(_mm_slli_epi64(ones, 54), 2)));
81 [[nodiscard]] hi_force_inline
static float get(array_type a)
noexcept
84 if constexpr (I == 0) {
85 return _mm_cvtsd_f64(L(a));
87 return _mm_cvtsd_f64(_mm_shuffle_pd(L(a), L(a), I));
91 [[nodiscard]] hi_force_inline
static array_type broadcast(
float a)
noexcept
93 return S(_mm_set1_pd(a));
96 [[nodiscard]] hi_force_inline
static array_type broadcast(array_type a)
noexcept
98 return S(_mm_shuffle_pd(L(a), L(a), 0));
101 [[nodiscard]] hi_force_inline
static array_type set_mask(
std::size_t mask)
noexcept
104 auto const tmp = _mm_set_epi32(
105 static_cast<int32_t
>(mask) << 30,
106 static_cast<int32_t
>(mask) << 30,
107 static_cast<int32_t
>(mask) << 31,
108 static_cast<int32_t
>(mask) << 31);
111 return S(_mm_castsi128_pd(_mm_srai_epi32(tmp, 31)));
116 [[nodiscard]] hi_force_inline
static std::size_t get_mask(array_type a)
noexcept
118 return _mm_movemask_pd(L(a));
121 [[nodiscard]] hi_force_inline
static array_type neg(array_type a)
noexcept
123 return S(_mm_sub_pd(_mm_setzero_pd(), L(a)));
126 template<std::
size_t Mask>
127 [[nodiscard]] hi_force_inline
constexpr static array_type neg_mask(array_type a)
noexcept
129 if constexpr (Mask == 0) {
131 }
else if constexpr (Mask == 0b11) {
132 return S(_mm_sub_pd(_mm_setzero_pd(), L(a)));
133#if defined(HI_HAS_SSE3)
134 }
else if constexpr (Mask == 0b01) {
135 return S(_mm_addsub_pd(_mm_setzero_pd(), L(a)));
138 auto const tmp = _mm_sub_pd(_mm_setzero_pd(), L(a));
139 return blend<Mask>(a, S(tmp));
143 [[nodiscard]] hi_force_inline
static array_type inv(array_type a)
noexcept
145 return _xor(set_all_ones(), a);
148 [[nodiscard]] hi_force_inline
static array_type sqrt(array_type a)
noexcept
150 return S(_mm_sqrt_pd(L(a)));
#if defined(HI_HAS_SSE4_1)
/** Round each element to an integral value using the current rounding mode.
 *
 * @param a The array of operands.
 * @return The rounded elements.
 */
[[nodiscard]] hi_force_inline static array_type round(array_type a) noexcept
{
    auto const rounded = _mm_round_pd(L(a), _MM_FROUND_CUR_DIRECTION);
    return S(rounded);
}

/** Round each element down to an integral value.
 *
 * @param a The array of operands.
 * @return The elements rounded toward negative infinity.
 */
[[nodiscard]] hi_force_inline static array_type floor(array_type a) noexcept
{
    auto const rounded = _mm_floor_pd(L(a));
    return S(rounded);
}

/** Round each element up to an integral value.
 *
 * @param a The array of operands.
 * @return The elements rounded toward positive infinity.
 */
[[nodiscard]] hi_force_inline static array_type ceil(array_type a) noexcept
{
    auto const rounded = _mm_ceil_pd(L(a));
    return S(rounded);
}
#endif
170 [[nodiscard]] hi_force_inline
static array_type add(array_type a, array_type b)
noexcept
172 return S(_mm_add_pd(L(a), L(b)));
175 [[nodiscard]] hi_force_inline
static array_type sub(array_type a, array_type b)
noexcept
177 return S(_mm_sub_pd(L(a), L(b)));
180 template<std::
size_t Mask>
181 [[nodiscard]] hi_force_inline
constexpr static array_type addsub_mask(array_type a, array_type b)
noexcept
183 if constexpr (Mask == 0) {
185 }
else if constexpr (Mask == 0b11) {
187#if defined(HI_HAS_SSE3)
188 }
else if constexpr (Mask == 0b10) {
189 return S(_mm_addsub_pd(L(a), L(b)));
192 return blend<Mask>(sub(a, b), add(a, b));
196 [[nodiscard]] hi_force_inline
static array_type mul(array_type a, array_type b)
noexcept
198 return S(_mm_mul_pd(L(a), L(b)));
201 [[nodiscard]] hi_force_inline
static array_type
div(array_type a, array_type b)
noexcept
203 return S(_mm_div_pd(L(a), L(b)));
206 [[nodiscard]] hi_force_inline
static array_type eq(array_type a, array_type b)
noexcept
208 return S(_mm_cmpeq_pd(L(a), L(b)));
211 [[nodiscard]] hi_force_inline
static array_type ne(array_type a, array_type b)
noexcept
213 return S(_mm_cmpneq_pd(L(a), L(b)));
216 [[nodiscard]] hi_force_inline
static array_type lt(array_type a, array_type b)
noexcept
218 return S(_mm_cmplt_pd(L(a), L(b)));
221 [[nodiscard]] hi_force_inline
static array_type gt(array_type a, array_type b)
noexcept
223 return S(_mm_cmpgt_pd(L(a), L(b)));
226 [[nodiscard]] hi_force_inline
static array_type le(array_type a, array_type b)
noexcept
228 return S(_mm_cmple_pd(L(a), L(b)));
231 [[nodiscard]] hi_force_inline
static array_type ge(array_type a, array_type b)
noexcept
233 return S(_mm_cmpge_pd(L(a), L(b)));
236 [[nodiscard]] hi_force_inline
static bool test(array_type a, array_type b)
noexcept
238#if defined(HI_HAS_SSE4_1)
239 return static_cast<bool>(_mm_testz_si128(_mm_castpd_si128(L(a)), _mm_castpd_si128(L(b))));
241 return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_castpd_si128(_mm_and_pd(L(a), L(b))), _mm_setzero_si128())) == 0xffff;
245 [[nodiscard]] hi_force_inline
static array_type max(array_type a, array_type b)
noexcept
247 return S(_mm_max_pd(L(a), L(b)));
250 [[nodiscard]] hi_force_inline
static array_type min(array_type a, array_type b)
noexcept
252 return S(_mm_min_pd(L(a), L(b)));
255 [[nodiscard]] hi_force_inline
static array_type clamp(array_type v, array_type lo, array_type
hi)
noexcept
257 return S(_mm_min_pd(_mm_max_pd(L(v), L(lo)), L(
hi)));
260 [[nodiscard]] hi_force_inline
static array_type _or(array_type a, array_type b)
noexcept
262 return S(_mm_or_pd(L(a), L(b)));
265 [[nodiscard]] hi_force_inline
static array_type _and(array_type a, array_type b)
noexcept
267 return S(_mm_and_pd(L(a), L(b)));
270 [[nodiscard]] hi_force_inline
static array_type _xor(array_type a, array_type b)
noexcept
272 return S(_mm_xor_pd(L(a), L(b)));
275 [[nodiscard]] hi_force_inline
static array_type andnot(array_type a, array_type b)
noexcept
277 return S(_mm_andnot_pd(L(a), L(b)));
280 [[nodiscard]] hi_force_inline
static array_type sll(array_type a,
unsigned int b)
noexcept
282 auto const b_ = _mm_set_epi32(0, 0, 0, b);
283 return S(_mm_castsi128_pd(_mm_sll_epi64(_mm_castpd_si128(L(a)), b_)));
286 [[nodiscard]] hi_force_inline
static array_type srl(array_type a,
unsigned int b)
noexcept
288 auto const b_ = _mm_set_epi32(0, 0, 0, b);
289 return S(_mm_castsi128_pd(_mm_srl_epi64(_mm_castpd_si128(L(a)), b_)));
292 [[nodiscard]] hi_force_inline
static array_type sra(array_type a,
unsigned int b)
noexcept
294 auto const b_ = _mm_set_epi32(0, 0, 0, b);
295 return S(_mm_castsi128_pd(_mm_sra_epi64(_mm_castpd_si128(L(a)), b_)));
#if defined(HI_HAS_SSE3)
/** Add the two elements within each array.
 *
 * @param a The array whose sum becomes element 0.
 * @param b The array whose sum becomes element 1.
 * @return The array `{a[0] + a[1], b[0] + b[1]}`.
 */
[[nodiscard]] hi_force_inline static array_type hadd(array_type a, array_type b) noexcept
{
    auto const lhs = L(a);
    auto const rhs = L(b);
    return S(_mm_hadd_pd(lhs, rhs));
}
#endif
#if defined(HI_HAS_SSE3)
/** Subtract the two elements within each array.
 *
 * @param a The array whose difference becomes element 0.
 * @param b The array whose difference becomes element 1.
 * @return The array `{a[0] - a[1], b[0] - b[1]}`.
 */
[[nodiscard]] hi_force_inline static array_type hsub(array_type a, array_type b) noexcept
{
    auto const lhs = L(a);
    auto const rhs = L(b);
    return S(_mm_hsub_pd(lhs, rhs));
}
#endif
/** Build the immediate operand for `_mm_shuffle_pd()` from element indices.
 *
 * `_mm_shuffle_pd()` uses one bit per destination lane: bit `i` selects the
 * source element for lane `i`. Two bits per lane is the layout of the
 * four-lane float shuffle and gives the wrong element here.
 *
 * @tparam Indices Two source indices, one per destination lane. A negative
 *                 index keeps the element at its own position.
 * @return The immediate, with bit `i` holding the source index for lane `i`.
 */
template<int... Indices>
[[nodiscard]] constexpr static unsigned int _make_indices_imm() noexcept
{
    static_assert(sizeof...(Indices) == 2);
    static_assert(((Indices < 2) && ...), "A two-element array only has elements 0 and 1.");

    constexpr auto indices = std::array{Indices...};

    auto r = 0U;
    for (std::size_t i = 0; i != 2; ++i) {
        auto const index = indices[i] < 0 ? static_cast<unsigned int>(i) : static_cast<unsigned int>(indices[i]);
        r |= index << i;
    }
    return r;
}
326 template<
int... Indices>
327 [[nodiscard]] hi_force_inline
static array_type
shuffle(array_type a)
noexcept
329 return S(_mm_shuffle_pd(L(a), L(a), _make_indices_imm<Indices...>()));
332 template<
size_t Mask>
333 [[nodiscard]] hi_force_inline
static array_type blend(array_type a, array_type b)
noexcept
335#if defined(HI_HAS_SSE4_1)
336 return S(_mm_blend_pd(L(a), L(b), Mask));
338 auto const lo = _mm_unpacklo_pd(L(a), L(b));
339 auto const hi = _mm_unpackhi_pd(L(a), L(b));
340 return S(_mm_shuffle_pd(lo,
hi, Mask));
344 [[nodiscard]] hi_force_inline
static array_type sum(array_type a)
noexcept
346 auto const tmp = _mm_shuffle_pd(L(a), L(a), 0b01);
347 return S(_mm_add_pd(L(a), tmp));
350 template<
size_t Mask>
351 [[nodiscard]] hi_force_inline
static array_type dot(array_type a, array_type b)
noexcept
353#if defined(HI_HAS_SSE4_1)
354 return S(_mm_dp_pd(L(a), L(b), (Mask << 2) | 0b11));
356 auto const multiplied = blend<Mask>(set_zero(), mul(a, b));
357 return sum(multiplied);
@ round
The end cap of the line is round.
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20