7#include "array_intrinsic.hpp"
21hi_export_module(hikocpu : array_intrinsic_f32x4);
23hi_export
namespace hi {
26#if defined(HI_HAS_AVX)
28struct array_intrinsic<double, 4> {
29 using value_type = double;
30 using register_type = __m256d;
35 [[nodiscard]] hi_force_inline
static register_type L(array_type a)
noexcept
37 return _mm256_loadu_pd(a.data());
42 [[nodiscard]] hi_force_inline
static array_type S(register_type a)
noexcept
44 auto r = array_type{};
45 _mm256_storeu_pd(r.data(), a);
49 [[nodiscard]] hi_force_inline
static array_type undefined() noexcept
51 return S(_mm256_undefined_pd());
54 [[nodiscard]] hi_force_inline
static array_type set(
double a,
double b,
double c,
double d)
noexcept
56 return S(_mm256_set_pd(d, c, b, a));
59 [[nodiscard]] hi_force_inline
static array_type set(
double a)
noexcept
61 return S(_mm256_set_pd(0.0, 0.0, 0.0, a));
64 [[nodiscard]] hi_force_inline
static array_type set_zero() noexcept
66 return S(_mm256_setzero_pd());
69 [[nodiscard]] hi_force_inline
static array_type set_all_ones() noexcept
71 return S(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OS));
74 [[nodiscard]] hi_force_inline
static array_type set_one() noexcept
76 return S(_mm256_set1_pd(1.0f));
// Extract a single element of `a` as a scalar double.
// NOTE(review): `I` is not declared in the visible lines; presumably it is a
// non-type template parameter selecting the element index, and the braces /
// `else` lines of the later branches are missing here -- confirm against the
// full file.
80 [[nodiscard]] hi_force_inline
static double get(array_type a)
noexcept
// Element 0 is the lowest double of the register and can be read directly.
84 if constexpr (I == 0) {
85 return _mm256_cvtsd_f64(L(a));
86 }
else if constexpr (I == 1) {
// Bring element 1 into the lowest position, then read it.
87 return _mm256_cvtsd_f64(_mm256_shuffle_pd(L(a), L(a), 0b1));
// The remaining cases read from the upper 128-bit half of the register.
89 auto const tmp = _mm256_extractf128_pd(L(a), 0b1);
90 if constexpr (I == 2) {
91 return _mm_cvtsd_f64(tmp);
// Swap the two doubles of the upper half so that its second one comes first.
93 return _mm_cvtsd_f64(_mm_permute_pd(tmp, 0b1));
98 [[nodiscard]] hi_force_inline
static array_type broadcast(
double a)
noexcept
100 return S(_mm256_set1_pd(a));
103 [[nodiscard]] hi_force_inline
static array_type broadcast(array_type a)
noexcept
106 auto lo = _mm256_extractf128_pd(tmp, 0b0);
107 tmp = _mm256_insertf128_pd(tmp, lo, 0b1);
108 return S(_mm256_permute_pd(tmp, 0b0000));
113 [[nodiscard]] hi_force_inline
static std::size_t get_mask(array_type a)
noexcept
115 return _mm256_movemask_pd(L(a));
118 [[nodiscard]] hi_force_inline
static array_type neg(array_type a)
noexcept
120 return S(_mm256_sub_pd(_mm256_setzero_pd(), L(a)));
123 template<std::
size_t Mask>
124 [[nodiscard]] hi_force_inline
constexpr static array_type neg_mask(array_type a)
noexcept
126 if constexpr (Mask == 0) {
128 }
else if constexpr (Mask == 0b1111) {
129 return S(_mm256_sub_pd(_mm256_setzero_pd(), L(a)));
130#if defined(HI_HAS_SSE3)
131 }
else if constexpr (Mask == 0b0101) {
132 return S(_mm256_addsub_pd(_mm256_setzero_pd(), L(a)));
135 auto const tmp = _mm256_sub_pd(_mm256_setzero_pd(), L(a));
136 return blend<Mask>(a, S(tmp));
140 [[nodiscard]] hi_force_inline
static array_type inv(array_type a)
noexcept
142 return _xor(set_all_ones(), a);
145 [[nodiscard]] hi_force_inline
static array_type rcp(array_type a)
noexcept
147 return S(_mm256_div_pd(_mm256_set1_pd(1.0), L(a)));
150 [[nodiscard]] hi_force_inline
static array_type sqrt(array_type a)
noexcept
152 return S(_mm256_sqrt_pd(L(a)));
155 [[nodiscard]] hi_force_inline
static array_type rsqrt(array_type a)
noexcept
157 return S(_mm256_div_pd(_mm256_set1_pd(1.0), _mm256_sqrt_pd(L(a))));
// Round every element of `a` to an integral value.
160#if defined(HI_HAS_SSE2)
161 [[nodiscard]] hi_force_inline
static array_type
round(array_type a)
noexcept
163#if defined(HI_HAS_SSE4_1)
// Preferred path: round with the currently configured rounding direction.
164 return S(_mm256_round_pd(L(a), _MM_FROUND_CUR_DIRECTION));
// Fallback path: round by converting to 32-bit integers and back.
// NOTE(review): the lines defining `check_max` and `check_min`, and the
// preprocessor #else/#endif lines, are not visible here; presumably the two
// checks test that the value fits the 32-bit integer range -- confirm.
166 auto const a_ = L(a);
// NOTE(review): _mm256_cvtps_epi32() is the single-precision conversion while
// `a_` is a __m256d; the double-precision form is _mm256_cvtpd_epi32() (which
// yields a __m128i). Verify that this path compiles when it is enabled.
167 auto const rounded = _mm256_cvtepi32_pd(_mm256_cvtps_epi32(a_));
170 auto const check_bounds = _mm256_and_pd(check_max, check_min);
// Use the rounded value where both checks passed, the original value elsewhere.
172 auto const good_rounded = _mm256_and_pd(check_bounds, rounded);
173 auto const good_a = _mm256_andnot_pd(check_bounds, a_);
174 return S(_mm256_or_pd(good_rounded, good_a));
179#if defined(HI_HAS_SSE4_1)
180 [[nodiscard]] hi_force_inline
static array_type
floor(array_type a)
noexcept
182 return S(_mm256_floor_pd(L(a)));
185 [[nodiscard]] hi_force_inline
static array_type
ceil(array_type a)
noexcept
187 return S(_mm256_ceil_pd(L(a)));
191 [[nodiscard]] hi_force_inline
static array_type add(array_type a, array_type b)
noexcept
193 return S(_mm256_add_pd(L(a), L(b)));
196 [[nodiscard]] hi_force_inline
static array_type sub(array_type a, array_type b)
noexcept
198 return S(_mm256_sub_pd(L(a), L(b)));
201 template<std::
size_t Mask>
202 [[nodiscard]] hi_force_inline
constexpr static array_type addsub_mask(array_type a, array_type b)
noexcept
204 if constexpr (Mask == 0) {
206 }
else if constexpr (Mask == 0b1111) {
208#if defined(HI_HAS_SSE3)
209 }
else if constexpr (Mask == 0b1010) {
210 return S(_mm256_addsub_pd(L(a), L(b)));
213 return blend<Mask>(sub(a, b), add(a, b));
217 [[nodiscard]] hi_force_inline
static array_type mul(array_type a, array_type b)
noexcept
219 return S(_mm256_mul_pd(L(a), L(b)));
222 [[nodiscard]] hi_force_inline
static array_type
div(array_type a, array_type b)
noexcept
224 return S(_mm256_div_pd(L(a), L(b)));
227 [[nodiscard]] hi_force_inline
static array_type eq(array_type a, array_type b)
noexcept
229 return S(_mm256_cmp_pd(L(a), L(b), _CMP_EQ_OS));
232 [[nodiscard]] hi_force_inline
static array_type ne(array_type a, array_type b)
noexcept
234 return S(_mm256_cmp_pd(L(a), L(b), _CMP_NEQ_OS));
237 [[nodiscard]] hi_force_inline
static array_type lt(array_type a, array_type b)
noexcept
239 return S(_mm256_cmp_pd(L(a), L(b), _CMP_LT_OS));
242 [[nodiscard]] hi_force_inline
static array_type gt(array_type a, array_type b)
noexcept
244 return S(_mm256_cmp_pd(L(a), L(b), _CMP_GT_OS));
247 [[nodiscard]] hi_force_inline
static array_type le(array_type a, array_type b)
noexcept
249 return S(_mm256_cmp_pd(L(a), L(b), _CMP_LE_OS));
252 [[nodiscard]] hi_force_inline
static array_type ge(array_type a, array_type b)
noexcept
254 return S(_mm256_cmp_pd(L(a), L(b), _CMP_GE_OS));
257 [[nodiscard]] hi_force_inline
static array_type max(array_type a, array_type b)
noexcept
259 return S(_mm256_max_pd(L(a), L(b)));
262 [[nodiscard]] hi_force_inline
static array_type min(array_type a, array_type b)
noexcept
264 return S(_mm256_min_pd(L(a), L(b)));
267 [[nodiscard]] hi_force_inline
static array_type clamp(array_type v, array_type lo, array_type
hi)
noexcept
269 return S(_mm256_min_pd(_mm256_max_pd(L(v), L(lo)), L(
hi)));
272 [[nodiscard]] hi_force_inline
static array_type _or(array_type a, array_type b)
noexcept
274 return S(_mm256_or_pd(L(a), L(b)));
277 [[nodiscard]] hi_force_inline
static array_type _and(array_type a, array_type b)
noexcept
279 return S(_mm256_and_pd(L(a), L(b)));
282 [[nodiscard]] hi_force_inline
static array_type _xor(array_type a, array_type b)
noexcept
284 return S(_mm256_xor_pd(L(a), L(b)));
287 [[nodiscard]] hi_force_inline
static array_type andnot(array_type a, array_type b)
noexcept
289 return S(_mm256_andnot_pd(L(a), L(b)));
#if defined(HI_HAS_SSE2)
/** Shift the bits of the array left, in 32-bit lanes.
 *
 * The 256-bit integer shift `_mm256_sll_epi32()` is an AVX2 instruction, which
 * is not implied by the AVX / SSE2 guards of this code. The shift is therefore
 * done on each 128-bit half with the SSE2 instruction, which produces the same
 * result.
 *
 * NOTE(review): the shift operates on 32-bit lanes although the elements are
 * 64-bit doubles; this is the original behaviour and is preserved -- confirm
 * that it is intended.
 *
 * @param a The array whose bit pattern is shifted.
 * @param b The number of bits to shift by.
 * @return The shifted bit pattern, zeros shifted in.
 */
[[nodiscard]] hi_force_inline static array_type sll(array_type a, unsigned int b) noexcept
{
    auto const count = _mm_set_epi32(0, 0, 0, static_cast<int>(b));
    auto const bits = _mm256_castpd_si256(L(a));

    auto const lo = _mm_sll_epi32(_mm256_castsi256_si128(bits), count);
    auto const hi = _mm_sll_epi32(_mm256_extractf128_si256(bits, 1), count);

    auto const shifted = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
    return S(_mm256_castsi256_pd(shifted));
}
#endif
#if defined(HI_HAS_SSE2)
/** Shift the bits of the array right logically, in 32-bit lanes.
 *
 * The 256-bit integer shift `_mm256_srl_epi32()` is an AVX2 instruction, which
 * is not implied by the AVX / SSE2 guards of this code. The shift is therefore
 * done on each 128-bit half with the SSE2 instruction, which produces the same
 * result.
 *
 * NOTE(review): the shift operates on 32-bit lanes although the elements are
 * 64-bit doubles; this is the original behaviour and is preserved -- confirm
 * that it is intended.
 *
 * @param a The array whose bit pattern is shifted.
 * @param b The number of bits to shift by.
 * @return The shifted bit pattern, zeros shifted in.
 */
[[nodiscard]] hi_force_inline static array_type srl(array_type a, unsigned int b) noexcept
{
    auto const count = _mm_set_epi32(0, 0, 0, static_cast<int>(b));
    auto const bits = _mm256_castpd_si256(L(a));

    auto const lo = _mm_srl_epi32(_mm256_castsi256_si128(bits), count);
    auto const hi = _mm_srl_epi32(_mm256_extractf128_si256(bits, 1), count);

    auto const shifted = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
    return S(_mm256_castsi256_pd(shifted));
}
#endif
#if defined(HI_HAS_SSE2)
/** Shift the bits of the array right arithmetically, in 32-bit lanes.
 *
 * The 256-bit integer shift `_mm256_sra_epi32()` is an AVX2 instruction, which
 * is not implied by the AVX / SSE2 guards of this code. The shift is therefore
 * done on each 128-bit half with the SSE2 instruction, which produces the same
 * result.
 *
 * NOTE(review): the shift operates on 32-bit lanes although the elements are
 * 64-bit doubles; this is the original behaviour and is preserved -- confirm
 * that it is intended.
 *
 * @param a The array whose bit pattern is shifted.
 * @param b The number of bits to shift by.
 * @return The shifted bit pattern, the sign bit of each 32-bit lane shifted in.
 */
[[nodiscard]] hi_force_inline static array_type sra(array_type a, unsigned int b) noexcept
{
    auto const count = _mm_set_epi32(0, 0, 0, static_cast<int>(b));
    auto const bits = _mm256_castpd_si256(L(a));

    auto const lo = _mm_sra_epi32(_mm256_castsi256_si128(bits), count);
    auto const hi = _mm_sra_epi32(_mm256_extractf128_si256(bits, 1), count);

    auto const shifted = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
    return S(_mm256_castsi256_pd(shifted));
}
#endif
/** Encode four element indices into an 8-bit immediate.
 *
 * Destination element i occupies bits [2i+1 : 2i] of the result and holds the
 * index of its source element.
 *
 * @tparam Indices Exactly four source indices. A negative index means
 *                 "keep element i in place" and is encoded as i. Indices
 *                 above 3 are rejected at compile time, since they would
 *                 spill into the bit-field of the next element.
 * @return The encoded immediate.
 */
template<int... Indices>
[[nodiscard]] constexpr static unsigned int _make_indices_imm() noexcept
{
    static_assert(sizeof...(Indices) == 4);
    static_assert(((Indices < 4) && ...), "Each index must fit in two bits.");

    constexpr auto indices = std::array{Indices...};

    auto r = 0U;
    for (std::size_t i = 0; i != 4; ++i) {
        // Negative index: the element stays where it is.
        auto const index = indices[i] < 0 ? static_cast<unsigned int>(i) : static_cast<unsigned int>(indices[i]);
        r |= index << (i * 2);
    }
    return r;
}
330 template<
int... Indices>
331 [[nodiscard]] hi_force_inline
static array_type
shuffle(array_type a)
noexcept
333 return S(_mm256_shuffle_pd(L(a), L(a), _make_indices_imm<Indices...>()));
336 template<
size_t Mask>
337 [[nodiscard]] hi_force_inline
static array_type blend(array_type a, array_type b)
noexcept
339#if defined(HI_HAS_SSE4_1)
340 return S(_mm256_blend_pd(L(a), L(b), Mask));
342 auto const lo = _mm256_unpacklo_pd(L(a), L(b));
343 auto const hi = _mm256_unpackhi_pd(L(a), L(b));
345 constexpr auto indices =
346 (Mask & 0b0001 ? 0b00'00'00'01U : 0b00'00'00'00U) |
347 (Mask & 0b0010 ? 0b00'00'11'00U : 0b00'00'10'00U) |
348 (Mask & 0b0100 ? 0b00'01'00'00U : 0b00'00'00'00U) |
349 (Mask & 0b1000 ? 0b11'00'00'00U : 0b10'00'00'00U);
351 return S(_mm256_shuffle_pd(lo,
hi, indices));
355 template<
size_t Mask>
356 [[nodiscard]] hi_force_inline
static array_type dot(array_type a, array_type b)
noexcept
358#if defined(HI_HAS_SSE4_1)
359 return S(_mm256_dp_pd(L(a), L(b), (Mask << 4) | 0b1111));
361 auto const multiplied = blend<Mask>(set_zero(), mul(a, b));
362 return sum(multiplied);
@ round
The end cap of the line is round.
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20