HikoGUI
A low-latency retained-mode GUI
numeric_array.hpp
1// Copyright Take Vos 2020-2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../architecture.hpp"
8#include "../concepts.hpp"
9#include "../cast.hpp"
10#include "../type_traits.hpp"
11#include "../float16.hpp"
12#include "../math.hpp"
13
14#if defined(HI_HAS_AVX)
15#include "swizzle_avx.hpp"
16#include <immintrin.h> // AVX, AVX2, FMA
17#endif
18#if defined(HI_HAS_SSE4_2)
19#include <nmmintrin.h> // SSE4.2
20#endif
21#if defined(HI_HAS_SSE4_1)
22#include "float16_sse4_1.hpp"
23#include <smmintrin.h> // SSE4.1
24#include <ammintrin.h> // SSE4A
25#endif
26#if defined(HI_HAS_SSSE3)
27#include <tmmintrin.h> // SSSE3
28#endif
29#if defined(HI_HAS_SSE3)
30#include <pmmintrin.h> // SSE3
31#endif
32#if defined(HI_HAS_SSE2)
33#include <emmintrin.h> // SSE2
34#endif
35#if defined(HI_HAS_SSE)
36#include <xmmintrin.h> // SSE
37#endif
38
39#include <cstdint>
40#include <ostream>
41#include <string>
42#include <array>
43#include <type_traits>
44#include <concepts>
45#include <bit>
46#include <climits>
47#include <utility>
48
49hi_warning_push();
50// C4702 unreachable code: Suppressed due to intrinsics and std::is_constant_evaluated()
51hi_warning_ignore_msvc(4702);
52// C26490: Don't use reinterpret_cast (type.1).
53// Needed for casting pointers to or from SSE registers.
54hi_warning_ignore_msvc(26490);
55
56namespace hi::inline v1 {
57
58template<numeric_limited T, std::size_t N>
59struct numeric_array {
60 using container_type = std::array<T, N>;
61 using value_type = typename container_type::value_type;
62 using size_type = typename container_type::size_type;
63 using difference_type = typename container_type::difference_type;
64 using reference = typename container_type::reference;
65 using const_reference = typename container_type::const_reference;
66 using pointer = typename container_type::pointer;
67 using const_pointer = typename container_type::const_pointer;
68 using iterator = typename container_type::iterator;
69 using const_iterator = typename container_type::const_iterator;
70
71 constexpr static bool is_i8x1 = std::is_same_v<T, int8_t> && N == 1;
72 constexpr static bool is_i8x2 = std::is_same_v<T, int8_t> && N == 2;
73 constexpr static bool is_i8x4 = std::is_same_v<T, int8_t> && N == 4;
74 constexpr static bool is_i8x8 = std::is_same_v<T, int8_t> && N == 8;
75 constexpr static bool is_i8x16 = std::is_same_v<T, int8_t> && N == 16;
76 constexpr static bool is_i8x32 = std::is_same_v<T, int8_t> && N == 32;
77 constexpr static bool is_i8x64 = std::is_same_v<T, int8_t> && N == 64;
78 constexpr static bool is_u8x1 = std::is_same_v<T, uint8_t> && N == 1;
79 constexpr static bool is_u8x2 = std::is_same_v<T, uint8_t> && N == 2;
80 constexpr static bool is_u8x4 = std::is_same_v<T, uint8_t> && N == 4;
81 constexpr static bool is_u8x8 = std::is_same_v<T, uint8_t> && N == 8;
82 constexpr static bool is_u8x16 = std::is_same_v<T, uint8_t> && N == 16;
83 constexpr static bool is_u8x32 = std::is_same_v<T, uint8_t> && N == 32;
84 constexpr static bool is_u8x64 = std::is_same_v<T, uint8_t> && N == 64;
85
86 constexpr static bool is_i16x1 = std::is_same_v<T, int16_t> && N == 1;
87 constexpr static bool is_i16x2 = std::is_same_v<T, int16_t> && N == 2;
88 constexpr static bool is_i16x4 = std::is_same_v<T, int16_t> && N == 4;
89 constexpr static bool is_i16x8 = std::is_same_v<T, int16_t> && N == 8;
90 constexpr static bool is_i16x16 = std::is_same_v<T, int16_t> && N == 16;
91 constexpr static bool is_i16x32 = std::is_same_v<T, int16_t> && N == 32;
92 constexpr static bool is_u16x1 = std::is_same_v<T, uint16_t> && N == 1;
93 constexpr static bool is_u16x2 = std::is_same_v<T, uint16_t> && N == 2;
94 constexpr static bool is_u16x4 = std::is_same_v<T, uint16_t> && N == 4;
95 constexpr static bool is_u16x8 = std::is_same_v<T, uint16_t> && N == 8;
96 constexpr static bool is_u16x16 = std::is_same_v<T, uint16_t> && N == 16;
97 constexpr static bool is_u16x32 = std::is_same_v<T, uint16_t> && N == 32;
98 constexpr static bool is_f16x4 = std::is_same_v<T, float16> && N == 4;
99
100 constexpr static bool is_i32x1 = std::is_same_v<T, int32_t> && N == 1;
101 constexpr static bool is_i32x2 = std::is_same_v<T, int32_t> && N == 2;
102 constexpr static bool is_i32x4 = std::is_same_v<T, int32_t> && N == 4;
103 constexpr static bool is_i32x8 = std::is_same_v<T, int32_t> && N == 8;
104 constexpr static bool is_i32x16 = std::is_same_v<T, int32_t> && N == 16;
105 constexpr static bool is_u32x1 = std::is_same_v<T, uint32_t> && N == 1;
106 constexpr static bool is_u32x2 = std::is_same_v<T, uint32_t> && N == 2;
107 constexpr static bool is_u32x4 = std::is_same_v<T, uint32_t> && N == 4;
108 constexpr static bool is_u32x8 = std::is_same_v<T, uint32_t> && N == 8;
109 constexpr static bool is_u32x16 = std::is_same_v<T, uint32_t> && N == 16;
110 constexpr static bool is_f32x1 = std::is_same_v<T, float> && N == 1;
111 constexpr static bool is_f32x2 = std::is_same_v<T, float> && N == 2;
112 constexpr static bool is_f32x4 = std::is_same_v<T, float> && N == 4;
113 constexpr static bool is_f32x8 = std::is_same_v<T, float> && N == 8;
114 constexpr static bool is_f32x16 = std::is_same_v<T, float> && N == 16;
115
116 constexpr static bool is_i64x1 = std::is_same_v<T, int64_t> && N == 1;
117 constexpr static bool is_i64x2 = std::is_same_v<T, int64_t> && N == 2;
118 constexpr static bool is_i64x4 = std::is_same_v<T, int64_t> && N == 4;
119 constexpr static bool is_i64x8 = std::is_same_v<T, int64_t> && N == 8;
120 constexpr static bool is_u64x1 = std::is_same_v<T, uint64_t> && N == 1;
121 constexpr static bool is_u64x2 = std::is_same_v<T, uint64_t> && N == 2;
122 constexpr static bool is_u64x4 = std::is_same_v<T, uint64_t> && N == 4;
123 constexpr static bool is_u64x8 = std::is_same_v<T, uint64_t> && N == 8;
124 constexpr static bool is_f64x1 = std::is_same_v<T, double> && N == 1;
125 constexpr static bool is_f64x2 = std::is_same_v<T, double> && N == 2;
126 constexpr static bool is_f64x4 = std::is_same_v<T, double> && N == 4;
127 constexpr static bool is_f64x8 = std::is_same_v<T, double> && N == 8;
128
129 container_type v;
130
131 constexpr numeric_array() noexcept
132 {
133 if (not std::is_constant_evaluated()) {
134#if defined(HI_HAS_AVX)
135 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
136 _mm256_storeu_si256(reinterpret_cast<__m256i *>(v.data()), _mm256_setzero_si256());
137 return;
138 } else if constexpr (is_f64x4) {
139 _mm256_storeu_pd(v.data(), _mm256_setzero_pd());
140 return;
141 } else if constexpr (is_f32x8) {
142 _mm256_storeu_ps(v.data(), _mm256_setzero_ps());
143 return;
144 }
145#endif
146#if defined(HI_HAS_SSE2)
147 if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
148 _mm_storeu_si128(reinterpret_cast<__m128i *>(v.data()), _mm_setzero_si128());
149 return;
150 } else if constexpr (is_f64x2) {
151 _mm_storeu_pd(v.data(), _mm_setzero_pd());
152 return;
153 }
154#endif
155#if defined(HI_HAS_SSE)
156 if constexpr (is_f32x4) {
157 _mm_storeu_ps(v.data(), _mm_setzero_ps());
158 return;
159 }
160#endif
161 }
162
163 for (auto i = 0_uz; i != N; ++i) {
164 v[i] = T{};
165 }
166 }
167
168 constexpr numeric_array(numeric_array const& rhs) noexcept = default;
169 constexpr numeric_array(numeric_array&& rhs) noexcept = default;
170 constexpr numeric_array& operator=(numeric_array const& rhs) noexcept = default;
171 constexpr numeric_array& operator=(numeric_array&& rhs) noexcept = default;
172
173 template<numeric_limited U, std::size_t M>
174 [[nodiscard]] constexpr explicit numeric_array(numeric_array<U, M> const& other) noexcept : v()
175 {
176 if (!std::is_constant_evaluated()) {
177#if defined(HI_HAS_AVX)
178 if constexpr (is_f64x4 and other.is_f32x4) {
179 v = numeric_array{_mm256_cvtps_pd(other.reg())};
180 return;
181 } else if constexpr (is_f64x4 and other.is_i32x4) {
182 v = numeric_array{_mm256_cvtepi32_pd(other.reg())};
183 return;
184 } else if constexpr (is_f32x4 and other.is_f64x4) {
185 v = numeric_array{_mm256_cvtpd_ps(other.reg())};
186 return;
187 } else if constexpr (is_i32x4 and other.is_f64x4) {
188 v = numeric_array{_mm256_cvtpd_epi32(other.reg())};
189 return;
190 } else if constexpr (is_i32x8 and other.is_f32x8) {
191 v = numeric_array{_mm256_cvtps_epi32(other.reg())};
192 return;
193 } else if constexpr (is_f32x8 and other.is_i32x8) {
194 v = numeric_array{_mm256_cvtepi32_ps(other.reg())};
195 return;
196 }
197#endif
198#if defined(HI_HAS_SSE4_1)
199 if constexpr (is_u8x4 and other.is_f32x4) {
200 hilet i32_4 = _mm_cvtps_epi32(other.reg());
201 hilet i16_8 = _mm_packs_epi32(i32_4, _mm_setzero_si128());
202 hilet u8_16 = _mm_packus_epi16(i16_8, _mm_setzero_si128());
203 v = numeric_array{u8_16};
204 return;
205 } else if constexpr (is_i64x2 and other.is_i32x4) {
206 v = numeric_array{_mm_cvtepi32_epi64(other.reg())};
207 return;
208 } else if constexpr (is_i64x2 and other.is_i16x8) {
209 v = numeric_array{_mm_cvtepi16_epi64(other.reg())};
210 return;
211 } else if constexpr (is_i32x4 and other.is_i16x8) {
212 v = numeric_array{_mm_cvtepi16_epi32(other.reg())};
213 return;
214 } else if constexpr (is_i64x2 and other.is_i8x16) {
215 v = numeric_array{_mm_cvtepi8_epi64(other.reg())};
216 return;
217 } else if constexpr (is_i32x4 and other.is_i8x16) {
218 v = numeric_array{_mm_cvtepi8_epi32(other.reg())};
219 return;
220 } else if constexpr (is_i16x8 and other.is_i8x16) {
221 v = numeric_array{_mm_cvtepi8_epi16(other.reg())};
222 return;
223 } else if constexpr (is_f16x4 and other.is_f32x4) {
224 v = numeric_array{_mm_cvtps_ph_sse4_1(other.reg())};
225 return;
226 } else if constexpr (is_f32x4 and other.is_f16x4) {
227 v = numeric_array{_mm_cvtph_ps_sse2(other.reg())};
228 return;
229 }
230
231#endif
232#if defined(HI_HAS_SSE2)
233 if constexpr (is_f64x2 and other.is_i32x4) {
234 v = numeric_array{_mm_cvtepi32_pd(other.reg())};
235 return;
236 } else if constexpr (is_f32x4 and other.is_i32x4) {
237 v = numeric_array{_mm_cvtepi32_ps(other.reg())};
238 return;
239 } else if constexpr (is_i32x4 and other.is_f32x4) {
240 v = numeric_array{_mm_cvtps_epi32(other.reg())};
241 return;
242 }
243#endif
244 }
245
246 for (std::size_t i = 0; i != N; ++i) {
247 if (i < M) {
248 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
249 // SSE conversions round floats before converting to integer.
250 v[i] = static_cast<value_type>(std::round(other[i]));
251 } else {
252 v[i] = static_cast<value_type>(other[i]);
253 }
254 } else {
255 v[i] = T{};
256 }
257 }
258 }
259
260 template<numeric_limited U, std::size_t M>
261 [[nodiscard]] constexpr explicit numeric_array(numeric_array<U, M> const& other1, numeric_array<U, M> const& other2) noexcept
262 :
263 v()
264 {
265 if (!std::is_constant_evaluated()) {
266#if defined(HI_HAS_AVX)
267 if constexpr (is_f64x4 and other1.is_f64x2 and other2.is_f64x2) {
268 v = numeric_array{_mm256_set_m128d(other2.reg(), other1.reg())};
269 } else if constexpr (is_f32x8 and other1.is_f32x4 and other2.is_f32x4) {
270 v = numeric_array{_mm256_set_m128(other2.reg(), other1.reg())};
271 } else if constexpr (
272 std::is_integral_v<T> and std::is_integral_v<U> and (sizeof(T) * N == 32) and (sizeof(U) * M == 16)) {
273 v = numeric_array{_mm256_set_m128i(other2.reg(), other1.reg())};
274 }
275#endif
276#if defined(HI_HAS_SSE4_1)
277 if constexpr (is_u16x8 and other1.is_u32x4 and other2.is_u32x4) {
278 v = numeric_array{_mm_packus_epi32(other2.reg(), other1.reg())};
279 }
280#endif
281#if defined(HI_HAS_SSE2)
282 if constexpr (is_i16x8 and other1.is_i32x4 and other2.is_i32x4) {
283 v = numeric_array{_mm_packs_epi32(other2.reg(), other1.reg())};
284 } else if constexpr (is_i8x16 and other1.is_i16x8 and other2.is_i16x8) {
285 v = numeric_array{_mm_packs_epi16(other2.reg(), other1.reg())};
286 } else if constexpr (is_u8x16 and other1.is_u16x8 and other2.is_u16x8) {
287 v = numeric_array{_mm_packus_epi16(other2.reg(), other1.reg())};
288 }
289#endif
290 }
291
292 for (std::size_t i = 0; i != N; ++i) {
293 if (i < M) {
294 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
295 // SSE conversions round floats before converting to integer.
296 v[i] = static_cast<value_type>(std::round(other1[i]));
297 } else {
298 v[i] = static_cast<value_type>(other1[i]);
299 }
300 } else if (i < M * 2) {
301 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
302 // SSE conversions round floats before converting to integer.
303 v[i] = static_cast<value_type>(std::round(other2[i - M]));
304 } else {
305 v[i] = static_cast<value_type>(other2[i - M]);
306 }
307 } else {
308 v[i] = T{};
309 }
310 }
311 }
312
313 [[nodiscard]] constexpr explicit numeric_array(T const& x) noexcept : v()
314 {
315 if (not std::is_constant_evaluated()) {
316#if defined(HI_HAS_SSE)
317 if constexpr (is_f32x4) {
318 *this = numeric_array{_mm_set_ss(x)};
319 return;
320 }
321#endif
322 }
323 get<0>(v) = x;
324 }
325
326 [[nodiscard]] constexpr explicit numeric_array(T const& x, T const& y) noexcept requires(N >= 2) : v()
327 {
328 if (not std::is_constant_evaluated()) {
329#if defined(HI_HAS_SSE2)
330 if constexpr (is_i32x4) {
331 *this = numeric_array{_mm_set_epi32(0, 0, y, x)};
332 return;
333 }
334#endif
335 }
336 get<0>(v) = x;
337 get<1>(v) = y;
338 }
339
340 [[nodiscard]] constexpr explicit numeric_array(T const& x, T const& y, T const& z) noexcept requires(N >= 3) : v()
341 {
342 if (not std::is_constant_evaluated()) {
343#if defined(HI_HAS_SSE2)
344 if constexpr (is_i32x4) {
345 *this = numeric_array{_mm_set_epi32(0, z, y, x)};
346 return;
347 }
348#endif
349 }
350 get<0>(v) = x;
351 get<1>(v) = y;
352 get<2>(v) = z;
353 }
354
355 [[nodiscard]] constexpr explicit numeric_array(T const& x, T const& y, T const& z, T const& w) noexcept requires(N >= 4) : v()
356 {
357 if (not std::is_constant_evaluated()) {
358#if defined(HI_HAS_SSE2)
359 if constexpr (is_i32x4) {
360 *this = numeric_array{_mm_set_epi32(w, z, y, x)};
361 return;
362 }
363#endif
364 }
365 get<0>(v) = x;
366 get<1>(v) = y;
367 get<2>(v) = z;
368 get<3>(v) = w;
369 }
370
371 [[nodiscard]] static constexpr numeric_array broadcast(T rhs) noexcept
372 {
373 if (not std::is_constant_evaluated()) {
374#if defined(HI_HAS_AVX)
375 if constexpr (is_f64x4) {
376 return numeric_array{_mm256_set1_pd(rhs)};
377 } else if constexpr (is_f32x8) {
378 return numeric_array{_mm256_set1_ps(rhs)};
379 } else if constexpr (is_i64x4) {
380 return numeric_array{_mm256_set1_epi64x(rhs)};
381 } else if constexpr (is_i32x8) {
382 return numeric_array{_mm256_set1_epi32(rhs)};
383 } else if constexpr (is_i16x16) {
384 return numeric_array{_mm256_set1_epi16(rhs)};
385 } else if constexpr (is_i8x32) {
386 return numeric_array{_mm256_set1_epi8(rhs)};
387 }
388#endif
389#if defined(HI_HAS_SSE2)
390 if constexpr (is_f64x2) {
391 return numeric_array{_mm_set1_pd(rhs)};
392 } else if constexpr (is_i64x2) {
393 return numeric_array{_mm_set1_epi64x(rhs)};
394 } else if constexpr (is_i32x4) {
395 return numeric_array{_mm_set1_epi32(rhs)};
396 } else if constexpr (is_i16x8) {
397 return numeric_array{_mm_set1_epi16(rhs)};
398 } else if constexpr (is_i8x16) {
399 return numeric_array{_mm_set1_epi8(rhs)};
400 }
401#endif
402#if defined(HI_HAS_SSE)
403 if constexpr (is_f32x4) {
404 return numeric_array{_mm_set1_ps(rhs)};
405 }
406#endif
407 }
408 auto r = numeric_array{};
409 for (std::size_t i = 0; i != N; ++i) {
410 r[i] = rhs;
411 }
412 return r;
413 }
414
415 [[nodiscard]] static constexpr numeric_array epsilon() noexcept
416 {
417 if constexpr (std::is_floating_point_v<T>) {
418 return broadcast(std::numeric_limits<T>::min());
419 } else {
420 return broadcast(T{0});
421 }
422 }
423
424 [[nodiscard]] numeric_array(std::array<T, N> const& rhs) noexcept : v(rhs) {}
425
426 numeric_array& operator=(std::array<T, N> const& rhs) noexcept
427 {
428 v = rhs;
429 return *this;
430 }
431
432 [[nodiscard]] operator std::array<T, N>() const noexcept
433 {
434 return v;
435 }
436
437#if defined(HI_HAS_SSE2)
438 [[nodiscard]] __m128i reg() const noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
439 {
440 return _mm_loadu_si128(reinterpret_cast<__m128i const *>(v.data()));
441 }
442
443 [[nodiscard]] __m128i reg() const noexcept requires(is_f16x4)
444 {
445 return _mm_set_epi16(0, 0, 0, 0, get<3>(v).get(), get<2>(v).get(), get<1>(v).get(), get<0>(v).get());
446 }
447#endif
448
449#if defined(HI_HAS_SSE2)
450 [[nodiscard]] __m128 reg() const noexcept requires(is_f32x4)
451 {
452 return _mm_loadu_ps(v.data());
453 }
454#endif
455
456#if defined(HI_HAS_SSE2)
457 [[nodiscard]] __m128d reg() const noexcept requires(is_f64x2)
458 {
459 return _mm_loadu_pd(v.data());
460 }
461#endif
462
463#if defined(HI_HAS_SSE2)
464 [[nodiscard]] explicit numeric_array(__m128i const& rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
465 {
466 _mm_storeu_si128(reinterpret_cast<__m128i *>(v.data()), rhs);
467 }
468#endif
469
470#if defined(HI_HAS_SSE4_1)
471 [[nodiscard]] explicit numeric_array(__m128i const& rhs) noexcept requires(is_f16x4) :
472 v(std::bit_cast<decltype(v)>(_mm_extract_epi64(rhs, 0)))
473 {
474 }
475#endif
476
477#if defined(HI_HAS_SSE4_1)
478 [[nodiscard]] explicit numeric_array(__m128i const& rhs) noexcept requires(is_u8x4) :
479 v(std::bit_cast<decltype(v)>(_mm_extract_epi32(rhs, 0)))
480 {
481 }
482#endif
483
484#if defined(HI_HAS_SSE2)
485 [[nodiscard]] explicit numeric_array(__m128 const& rhs) noexcept requires(is_f32x4)
486 {
487 _mm_storeu_ps(v.data(), rhs);
488 }
489#endif
490
491#if defined(HI_HAS_SSE2)
492 [[nodiscard]] explicit numeric_array(__m128d const& rhs) noexcept requires(is_f64x2)
493 {
494 _mm_storeu_pd(v.data(), rhs);
495 }
496#endif
497
498#if defined(HI_HAS_SSE2)
499 numeric_array& operator=(__m128i const& rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
500 {
501 _mm_storeu_si128(reinterpret_cast<__m128i *>(v.data()), rhs);
502 return *this;
503 }
504#endif
505
506#if defined(HI_HAS_SSE2)
507 numeric_array& operator=(__m128 const& rhs) noexcept requires(is_f32x4)
508 {
509 _mm_storeu_ps(v.data(), rhs);
510 return *this;
511 }
512#endif
513
514#if defined(HI_HAS_SSE2)
515 numeric_array& operator=(__m128d const& rhs) noexcept requires(is_f64x2)
516 {
517 _mm_storeu_pd(v.data(), rhs);
518 return *this;
519 }
520#endif
521
522#if defined(HI_HAS_AVX)
523 [[nodiscard]] __m256i reg() const noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
524 {
525 return _mm256_loadu_si256(reinterpret_cast<__m256i const *>(v.data()));
526 }
527#endif
528
529#if defined(HI_HAS_AVX)
530 [[nodiscard]] __m256 reg() const noexcept requires(is_f32x8)
531 {
532 return _mm256_loadu_ps(v.data());
533 }
534#endif
535
536#if defined(HI_HAS_AVX)
537 [[nodiscard]] __m256d reg() const noexcept requires(is_f64x4)
538 {
539 return _mm256_loadu_pd(v.data());
540 }
541#endif
542
543#if defined(HI_HAS_AVX)
544 [[nodiscard]] explicit numeric_array(__m256i const& rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
545 {
546 _mm256_storeu_si256(reinterpret_cast<__m256i *>(v.data()), rhs);
547 }
548#endif
549
550#if defined(HI_HAS_AVX)
551 [[nodiscard]] explicit numeric_array(__m256 const& rhs) noexcept requires(is_f32x8)
552 {
553 _mm256_storeu_ps(v.data(), rhs);
554 }
555#endif
556
557#if defined(HI_HAS_AVX)
558 [[nodiscard]] explicit numeric_array(__m256d const& rhs) noexcept requires(is_f64x4)
559 {
560 _mm256_storeu_pd(v.data(), rhs);
561 }
562#endif
563
564#if defined(HI_HAS_AVX)
565 numeric_array& operator=(__m256i const& rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
566 {
567 _mm256_storeu_si256(reinterpret_cast<__m256i *>(v.data()), rhs);
568 return *this;
569 }
570#endif
571
572#if defined(HI_HAS_AVX)
573 numeric_array& operator=(__m256 const& rhs) noexcept requires(is_f32x8)
574 {
575 _mm256_storeu_ps(v.data(), rhs);
576 return *this;
577 }
578#endif
579
580#if defined(HI_HAS_AVX)
581 numeric_array& operator=(__m256d const& rhs) noexcept requires(is_f64x4)
582 {
583 _mm256_storeu_pd(v.data(), rhs);
584 return *this;
585 }
586#endif
587
588 template<typename Other>
589 [[nodiscard]] constexpr friend Other bit_cast(numeric_array const& rhs) noexcept
590 requires(sizeof(Other) == sizeof(container_type))
591 {
592 if (not std::is_constant_evaluated()) {
593#if defined(HI_HAS_SSE2)
594 if constexpr (Other::is_f32x4 and std::is_integral_v<T>) {
595 return Other{_mm_castsi128_ps(rhs.reg())};
596 } else if constexpr (Other::is_f32x4 and is_f64x2) {
597 return Other{_mm_castpd_ps(rhs.reg())};
598 } else if constexpr (Other::is_f64x2 and std::is_integral_v<T>) {
599 return Other{_mm_castsi128_pd(rhs.reg())};
600 } else if constexpr (Other::is_f64x2 and is_f32x4) {
601 return Other{_mm_castps_pd(rhs.reg())};
602 } else if constexpr (std::is_integral_v<typename Other::value_type> and is_f32x4) {
603 return Other{_mm_castps_si128(rhs.reg())};
604 } else if constexpr (std::is_integral_v<typename Other::value_type> and is_f64x2) {
605 return Other{_mm_castpd_si128(rhs.reg())};
606 } else if constexpr (std::is_integral_v<typename Other::value_type> and std::is_integral_v<T>) {
607 return Other{rhs.reg()};
608 }
609#endif
610 }
611 return std::bit_cast<Other>(rhs);
612 }
613
617 {
618 if (not std::is_constant_evaluated()) {
619#if defined(HI_HAS_SSE2)
620 if constexpr (is_f64x2) {
621 return numeric_array{_mm_unpacklo_pd(a.reg(), b.reg())};
622 } else if constexpr (is_i64x2 or is_u64x2) {
623 return numeric_array{_mm_unpacklo_epi64(a.reg(), b.reg())};
624 } else if constexpr (is_i32x4 or is_u32x4) {
625 return numeric_array{_mm_unpacklo_epi32(a.reg(), b.reg())};
626 } else if constexpr (is_i16x8 or is_u16x8) {
627 return numeric_array{_mm_unpacklo_epi16(a.reg(), b.reg())};
628 } else if constexpr (is_i8x16 or is_u8x16) {
629 return numeric_array{_mm_unpacklo_epi8(a.reg(), b.reg())};
630 }
631#endif
632#if defined(HI_HAS_SSE)
633 if constexpr (is_f32x4) {
634 return numeric_array{_mm_unpacklo_ps(a.reg(), b.reg())};
635 }
636#endif
637 }
638
639 auto r = numeric_array{};
640 for (std::size_t i = 0; i != N; ++i) {
641 r[i] = (i % 2 == 0) ? a[i / 2] : b[i / 2];
642 }
643 return r;
644 }
645
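 // Load the first S bytes from memory into a zero-initialized numeric_array.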
650 template<std::size_t S>
651 [[nodiscard]] static constexpr numeric_array load(std::byte const *ptr) noexcept
652 {
653 auto r = numeric_array{};
654 std::memcpy(&r, ptr, S);
655 return r;
656 }
657
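 // Load a full numeric_array from a byte pointer.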
662 [[nodiscard]] static constexpr numeric_array load(std::byte const *ptr) noexcept
663 {
664 auto r = numeric_array{};
665 std::memcpy(&r, ptr, sizeof(r));
666 return r;
667 }
668
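 // Load a full numeric_array from an array of value_type.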
673 [[nodiscard]] static constexpr numeric_array load(T const *ptr) noexcept
674 {
675 auto r = numeric_array{};
676 std::memcpy(&r, ptr, sizeof(r));
677 return r;
678 }
679
680 template<std::size_t S>
681 constexpr void store(std::byte *ptr) const noexcept
682 {
683 std::memcpy(ptr, this, S);
684 }
685
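 // Store the full numeric_array to memory as bytes.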
689 constexpr void store(std::byte *ptr) const noexcept
690 {
691 store<sizeof(*this)>(ptr);
692 }
693
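 // True when any element is non-zero; for floating-point types an element counts
 // as non-zero when it lies outside the epsilon() range around zero.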
697 constexpr explicit operator bool() const noexcept
698 {
699 if constexpr (std::is_floating_point_v<T>) {
700 hilet ep = epsilon();
701 // check if any of the elements is outside the epsilon range.
702 return to_bool(gt(-ep, *this) | gt(*this, ep));
703 } else {
704 return to_bool(ne(*this, T{0}));
705 }
706 }
707
708 [[nodiscard]] constexpr T const& operator[](std::size_t i) const noexcept
709 {
710 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
711 hi_axiom(i < N);
712 return v[i];
713 }
714
715 [[nodiscard]] constexpr T& operator[](std::size_t i) noexcept
716 {
717 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
718 hi_axiom(i < N);
719 return v[i];
720 }
721
722 [[nodiscard]] constexpr reference front() noexcept
723 {
724 return v.front();
725 }
726
727 [[nodiscard]] constexpr const_reference front() const noexcept
728 {
729 return v.front();
730 }
731
732 [[nodiscard]] constexpr reference back() noexcept
733 {
734 return v.back();
735 }
736
737 [[nodiscard]] constexpr const_reference back() const noexcept
738 {
739 return v.back();
740 }
741
742 [[nodiscard]] constexpr pointer data() noexcept
743 {
744 return v.data();
745 }
746
747 [[nodiscard]] constexpr const_pointer data() const noexcept
748 {
749 return v.data();
750 }
751
752 [[nodiscard]] constexpr iterator begin() noexcept
753 {
754 return v.begin();
755 }
756
757 [[nodiscard]] constexpr const_iterator begin() const noexcept
758 {
759 return v.begin();
760 }
761
762 [[nodiscard]] constexpr const_iterator cbegin() const noexcept
763 {
764 return v.cbegin();
765 }
766
767 [[nodiscard]] constexpr iterator end() noexcept
768 {
769 return v.end();
770 }
771
772 [[nodiscard]] constexpr const_iterator end() const noexcept
773 {
774 return v.end();
775 }
776
777 [[nodiscard]] constexpr const_iterator cend() const noexcept
778 {
779 return v.cend();
780 }
781
782 [[nodiscard]] constexpr bool empty() const noexcept
783 {
784 return v.empty();
785 }
786
787 [[nodiscard]] constexpr size_type size() const noexcept
788 {
789 return v.size();
790 }
791
792 [[nodiscard]] constexpr size_type max_size() const noexcept
793 {
794 return v.max_size();
795 }
796
797 constexpr bool is_point() const noexcept
798 {
799 return v.back() != T{};
800 }
801
802 constexpr bool is_vector() const noexcept
803 {
804 return v.back() == T{};
805 }
806
807 constexpr bool is_opaque() const noexcept
808 {
809 return a() == T{1};
810 }
811
812 constexpr bool is_transparent() const noexcept
813 {
814 return a() == T{0};
815 }
816
817 [[nodiscard]] constexpr T const& x() const noexcept requires(N >= 1)
818 {
819 return std::get<0>(v);
820 }
821
822 [[nodiscard]] constexpr T const& y() const noexcept requires(N >= 2)
823 {
824 return std::get<1>(v);
825 }
826
827 [[nodiscard]] constexpr T const& z() const noexcept requires(N >= 3)
828 {
829 return std::get<2>(v);
830 }
831
832 [[nodiscard]] constexpr T const& w() const noexcept requires(N >= 4)
833 {
834 return std::get<3>(v);
835 }
836
837 [[nodiscard]] constexpr T& x() noexcept requires(N >= 1)
838 {
839 return std::get<0>(v);
840 }
841
842 [[nodiscard]] constexpr T& y() noexcept requires(N >= 2)
843 {
844 return std::get<1>(v);
845 }
846
847 [[nodiscard]] constexpr T& z() noexcept requires(N >= 3)
848 {
849 return std::get<2>(v);
850 }
851
852 [[nodiscard]] constexpr T& w() noexcept requires(N >= 4)
853 {
854 return std::get<3>(v);
855 }
856
857 [[nodiscard]] constexpr T const& r() const noexcept requires(N >= 1)
858 {
859 return std::get<0>(v);
860 }
861
862 [[nodiscard]] constexpr T const& g() const noexcept requires(N >= 2)
863 {
864 return std::get<1>(v);
865 }
866
867 [[nodiscard]] constexpr T const& b() const noexcept requires(N >= 3)
868 {
869 return std::get<2>(v);
870 }
871
872 [[nodiscard]] constexpr T const& a() const noexcept requires(N >= 4)
873 {
874 return std::get<3>(v);
875 }
876
877 [[nodiscard]] constexpr T& r() noexcept requires(N >= 1)
878 {
879 return std::get<0>(v);
880 }
881
882 [[nodiscard]] constexpr T& g() noexcept requires(N >= 2)
883 {
884 return std::get<1>(v);
885 }
886
887 [[nodiscard]] constexpr T& b() noexcept requires(N >= 3)
888 {
889 return std::get<2>(v);
890 }
891
892 [[nodiscard]] constexpr T& a() noexcept requires(N >= 4)
893 {
894 return std::get<3>(v);
895 }
896
897 [[nodiscard]] constexpr T const& width() const noexcept requires(N >= 1)
898 {
899 return std::get<0>(v);
900 }
901
902 [[nodiscard]] constexpr T const& height() const noexcept requires(N >= 2)
903 {
904 return std::get<1>(v);
905 }
906
907 [[nodiscard]] constexpr T const& depth() const noexcept requires(N >= 3)
908 {
909 return std::get<2>(v);
910 }
911
912 [[nodiscard]] constexpr T& width() noexcept requires(N >= 1)
913 {
914 return std::get<0>(v);
915 }
916
917 [[nodiscard]] constexpr T& height() noexcept requires(N >= 2)
918 {
919 return std::get<1>(v);
920 }
921
922 [[nodiscard]] constexpr T& depth() noexcept requires(N >= 3)
923 {
924 return std::get<2>(v);
925 }
926
927 constexpr numeric_array& operator<<=(unsigned int rhs) noexcept
928 {
929 return *this = *this << rhs;
930 }
931
932 constexpr numeric_array& operator>>=(unsigned int rhs) noexcept
933 {
934 return *this = *this >> rhs;
935 }
936
937 constexpr numeric_array& operator|=(numeric_array const& rhs) noexcept
938 {
939 return *this = *this | rhs;
940 }
941
942 constexpr numeric_array& operator|=(T const& rhs) noexcept
943 {
944 return *this = *this | rhs;
945 }
946
947 constexpr numeric_array& operator&=(numeric_array const& rhs) noexcept
948 {
949 return *this = *this & rhs;
950 }
951
952 constexpr numeric_array& operator&=(T const& rhs) noexcept
953 {
954 return *this = *this & rhs;
955 }
956
957 constexpr numeric_array& operator^=(numeric_array const& rhs) noexcept
958 {
959 return *this = *this ^ rhs;
960 }
961
962 constexpr numeric_array& operator^=(T const& rhs) noexcept
963 {
964 return *this = *this ^ rhs;
965 }
966
967 constexpr numeric_array& operator+=(numeric_array const& rhs) noexcept
968 {
969 return *this = *this + rhs;
970 }
971
972 constexpr numeric_array& operator+=(T const& rhs) noexcept
973 {
974 return *this = *this + rhs;
975 }
976
977 constexpr numeric_array& operator-=(numeric_array const& rhs) noexcept
978 {
979 return *this = *this - rhs;
980 }
981
982 constexpr numeric_array& operator-=(T const& rhs) noexcept
983 {
984 return *this = *this - rhs;
985 }
986
987 constexpr numeric_array& operator*=(numeric_array const& rhs) noexcept
988 {
989 return *this = *this * rhs;
990 }
991
992 constexpr numeric_array& operator*=(T const& rhs) noexcept
993 {
994 return *this = *this * rhs;
995 }
996
997 constexpr numeric_array& operator/=(numeric_array const& rhs) noexcept
998 {
999 return *this = *this / rhs;
1000 }
1001
1002 constexpr numeric_array& operator/=(T const& rhs) noexcept
1003 {
1004 return *this = *this / rhs;
1005 }
1006
1007 constexpr numeric_array& operator%=(numeric_array const& rhs) noexcept
1008 {
1009 return *this = *this % rhs;
1010 }
1011
1012 constexpr numeric_array& operator%=(T const& rhs) noexcept
1013 {
1014 return *this = *this % rhs;
1015 }
1016
1017 constexpr static ssize_t get_zero = -1;
1018 constexpr static ssize_t get_one = -2;
1019
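 // Get a reference to element I.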
1024 template<std::size_t I>
1025 [[nodiscard]] friend constexpr T& get(numeric_array& rhs) noexcept
1026 {
1027 static_assert(I < N, "Index out of bounds");
1028 return std::get<I>(rhs.v);
1029 }
1030
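 // Get element I by value; the special indices get_zero and get_one yield the constants 0 and 1.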
1036 template<ssize_t I>
1037 [[nodiscard]] friend constexpr T get(numeric_array&& rhs) noexcept
1038 {
1039 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
1040 static_assert(I >= -2 && I < narrow_cast<ssize_t>(N), "Index out of bounds");
1041 if constexpr (I == get_zero) {
1042 return T{0};
1043 } else if constexpr (I == get_one) {
1044 return T{1};
1045 } else {
1046 return std::get<I>(rhs.v);
1047 }
1048 }
1049
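 // Extract element I, using SSE/AVX extract instructions when available.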
1056 template<std::size_t I>
1057 [[nodiscard]] constexpr friend T extract(numeric_array const& rhs) noexcept
1058 {
1059 static_assert(I < N);
1060
1061 if (not std::is_constant_evaluated()) {
1062#if defined(HI_HAS_AVX2)
1063 if constexpr (is_i16x16 or is_u16x16) {
1064 return static_cast<T>(_mm256_extract_epi16(rhs.reg(), I));
1065 } else if constexpr (is_i8x32 or is_u8x32) {
1066 return static_cast<T>(_mm256_extract_epi8(rhs.reg(), I));
1067 }
1068#endif
1069#if defined(HI_HAS_AVX)
1070 if constexpr (is_f64x4) {
1071 return bit_cast<T>(_mm256_extract_epi64(_mm256_castpd_si256(rhs.reg()), I));
1072 } else if constexpr (is_f32x8) {
1073 return bit_cast<T>(_mm256_extract_epi32(_mm256_castps_si256(rhs.reg()), I));
1074 } else if constexpr (is_i64x4 or is_u64x4) {
1075 return static_cast<T>(_mm256_extract_epi64(rhs.reg(), I));
1076 } else if constexpr (is_i32x8 or is_u32x8) {
1077 return static_cast<T>(_mm256_extract_epi32(rhs.reg(), I));
1078 }
1079#endif
1080#if defined(HI_HAS_SSE4_1)
1081 if constexpr (is_f64x2) {
1082 return bit_cast<T>(_mm_extract_epi64(_mm_castpd_si128(rhs.reg()), I));
1083 } else if constexpr (is_f32x4) {
1084 return std::bit_cast<T>(_mm_extract_ps(rhs.reg(), I));
1085 } else if constexpr (is_i64x2 or is_u64x2) {
1086 return static_cast<T>(_mm_extract_epi64(rhs.reg(), I));
1087 } else if constexpr (is_i32x4 or is_u32x4) {
1088 return static_cast<T>(_mm_extract_epi32(rhs.reg(), I));
1089 } else if constexpr (is_i8x16 or is_u8x16) {
1090 return static_cast<T>(_mm_extract_epi8(rhs.reg(), I));
1091 }
1092#endif
1093#if defined(HI_HAS_SSE2)
1094 if constexpr (is_i16x8 or is_u16x8) {
1095 return static_cast<T>(_mm_extract_epi16(rhs.reg(), I));
1096 }
1097#endif
1098 }
1099
1100 return get<I>(rhs);
1101 }
1102
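 // Insert rhs at element I; each set bit in ZeroMask clears the corresponding element.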
1111 template<std::size_t I, std::size_t ZeroMask = 0>
1112 [[nodiscard]] constexpr friend numeric_array insert(numeric_array const& lhs, T rhs) noexcept
1113 requires(is_f32x4 or is_i32x4 or is_u32x4)
1114 {
1115 static_assert(I < N);
1116 static_assert(ZeroMask <= ((1 << N) - 1));
1117
1118 if (not std::is_constant_evaluated()) {
1119#if defined(HI_HAS_SSE4_1)
1120 if constexpr (is_f32x4) {
1121 constexpr int imm8 = (I << 4) | ZeroMask;
1122 return numeric_array{_mm_insert_ps(lhs.reg(), _mm_set_ss(rhs), imm8)};
1123 } else if constexpr (is_i32x4 or is_u32x4) {
1124 constexpr int imm8 = (I << 4) | ZeroMask;
1125 return numeric_array{
1126 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(_mm_set1_epi32(rhs)), imm8))};
1127 }
1128#endif
1129 }
1130
1131 auto r = lhs;
1132 std::get<I>(r.v) = rhs;
1133 for (std::size_t i = 0; i != N; ++i) {
1134 if ((ZeroMask >> i) & 1) {
1135 r.v[i] = T{};
1136 }
1137 }
1138 return r;
1139 }
1140
1146 template<ssize_t I>
1147 [[nodiscard]] friend constexpr T get(numeric_array const& rhs) noexcept
1148 {
1149 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
1150 static_assert(I >= -2 && I < narrow_cast<ssize_t>(N), "Index out of bounds");
1151 if constexpr (I == get_zero) {
1152 return T{0};
1153 } else if constexpr (I == get_one) {
1154 return T{1};
1155 } else {
1156 return std::get<I>(rhs.v);
1157 }
1158 }
1159
1164 template<std::size_t Mask = ~std::size_t{0}>
1165 [[nodiscard]] friend constexpr numeric_array zero(numeric_array rhs) noexcept
1166 {
1167 if (not std::is_constant_evaluated()) {
1168#if defined(HI_HAS_SSE4_1)
1169 if constexpr (is_f32x4) {
1170 return numeric_array{_mm_insert_ps(rhs.reg(), rhs.reg(), Mask)};
1171 } else if constexpr (is_i32x4 or is_u32x4) {
1172 return numeric_array{
1173 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(rhs.reg()), _mm_castsi128_ps(rhs.reg()), Mask))};
1174 }
1175#endif
1176 }
1177
1178 auto r = numeric_array{};
1179 for (std::size_t i = 0; i != N; ++i) {
1180 if (to_bool((Mask >> i) & 1)) {
1181 r.v[i] = T{0};
1182 } else {
1183 r.v[i] = rhs.v[i];
1184 }
1185 }
1186 return r;
1187 }
1188
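 // Blend two vectors: bit i of Mask selects rhs[i], a clear bit selects lhs[i].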
1196 template<std::size_t Mask>
1197 [[nodiscard]] friend constexpr numeric_array blend(numeric_array const& lhs, numeric_array const& rhs) noexcept
1198 {
1199 if (not std::is_constant_evaluated()) {
1200#if defined(HI_HAS_AVX2)
1201 if constexpr (is_i32x8) {
1202 return numeric_array{_mm256_blend_epi32(lhs.reg(), rhs.reg(), Mask)};
1203 } else if constexpr (is_i64x2 or is_u64x2) {
1204 constexpr auto mask_x2 = ((Mask & 1) ? 0b0011 : 0) | ((Mask & 2) ? 0b1100 : 0);
1205 return numeric_array{_mm_blend_epi32(lhs.reg(), rhs.reg(), mask_x2)};
1206 } else if constexpr (is_i32x4 or is_u32x4) {
1207 return numeric_array{_mm_blend_epi32(lhs.reg(), rhs.reg(), Mask)};
1208 } else if constexpr (is_i16x16 or is_u16x16) {
1209 return numeric_array{_mm256_blend_epi16(lhs.reg(), rhs.reg(), Mask)};
1210 }
1211#endif
1212#if defined(HI_HAS_AVX)
1213 if constexpr (is_f64x4) {
1214 return numeric_array{_mm256_blend_pd(lhs.reg(), rhs.reg(), Mask)};
1215 } else if constexpr (is_f32x8) {
1216 return numeric_array{_mm256_blend_ps(lhs.reg(), rhs.reg(), Mask)};
1217 } else if constexpr (is_i64x4 or is_u64x4) {
1218 return numeric_array{
1219 _mm256_castpd_si256(_mm256_blend_pd(_mm256_castsi256_pd(lhs.reg()), _mm256_castsi256_pd(rhs.reg()), Mask))};
1220 } else if constexpr (is_i32x8 or is_u32x8) {
1221 return numeric_array{
1222 _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg()), Mask))};
1223 }
1224#endif
1225#if defined(HI_HAS_SSE4_1)
1226 if constexpr (is_f64x2) {
1227 return numeric_array{_mm_blend_pd(lhs.reg(), rhs.reg(), Mask)};
1228 } else if constexpr (is_f32x4) {
1229 return numeric_array{_mm_blend_ps(lhs.reg(), rhs.reg(), Mask)};
1230 } else if constexpr (is_i64x2 or is_u64x2) {
1231 return numeric_array{
1232 _mm_castpd_si128(_mm_blend_pd(_mm_castsi128_pd(lhs.reg()), _mm_castsi128_pd(rhs.reg()), Mask))};
1233 } else if constexpr (is_i32x4 or is_u32x4) {
1234 return numeric_array{
1235 _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg()), Mask))};
1236 } else if constexpr (is_i16x8 or is_u16x8) {
1237 return numeric_array{_mm_blend_epi16(lhs.reg(), rhs.reg(), Mask)};
1238 }
1239#endif
1240 }
1241
1242 auto r = numeric_array{};
1243 for (std::size_t i = 0; i != N; ++i) {
1244 r[i] = to_bool((Mask >> i) & 1) ? rhs[i] : lhs[i];
1245 }
1246 return r;
1247 }
1248
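 // Blend element-wise using a mask vector: non-zero mask elements select b, zero elements select a.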
1251 [[nodiscard]] friend constexpr numeric_array blend(numeric_array const& a, numeric_array const& b, numeric_array const& mask)
1252 {
1253 if (not std::is_constant_evaluated()) {
1254#if defined(HI_HAS_AVX2)
1255 if constexpr (is_i8x32 or is_u8x32) {
1256 return numeric_array{_mm256_blendv_epi8(a.reg(), b.reg(), mask.reg())};
1257 }
1258#endif
1259#if defined(HI_HAS_AVX)
1260 if constexpr (is_f64x4) {
1261 return numeric_array{_mm256_blendv_pd(a.reg(), b.reg(), mask.reg())};
1262 } else if constexpr (is_f32x8) {
1263 return numeric_array{_mm256_blendv_ps(a.reg(), b.reg(), mask.reg())};
1264 } else if constexpr (is_i64x4 or is_u64x4) {
1265 return numeric_array{_mm256_castpd_si256(_mm256_blendv_pd(
1266 _mm256_castsi256_pd(a.reg()), _mm256_castsi256_pd(b.reg()), _mm256_castsi256_pd(mask.reg())))};
1267 } else if constexpr (is_i32x8 or is_u32x8) {
1268 return numeric_array{_mm256_castps_si256(_mm256_blendv_ps(
1269 _mm256_castsi256_ps(a.reg()), _mm256_castsi256_ps(b.reg()), _mm256_castsi256_ps(mask.reg())))};
1270 }
1271#endif
1272#if defined(HI_HAS_SSE4_1)
1273 if constexpr (is_f64x2) {
1274 return numeric_array{_mm_blendv_pd(a.reg(), b.reg(), mask.reg())};
1275 } else if constexpr (is_f32x4) {
1276 return numeric_array{_mm_blendv_ps(a.reg(), b.reg(), mask.reg())};
1277 } else if constexpr (is_i64x2 or is_u64x2) {
1278 return numeric_array{_mm_castpd_si128(
1279 _mm_blendv_pd(_mm_castsi128_pd(a.reg()), _mm_castsi128_pd(b.reg()), _mm_castsi128_pd(mask.reg())))};
1280 } else if constexpr (is_i32x4 or is_u32x4) {
1281 return numeric_array{_mm_castps_si128(
1282 _mm_blendv_ps(_mm_castsi128_ps(a.reg()), _mm_castsi128_ps(b.reg()), _mm_castsi128_ps(mask.reg())))};
1283 } else if constexpr (is_i8x16 or is_u8x16) {
1284 return numeric_array{_mm_blendv_epi8(a.reg(), b.reg(), mask.reg())};
1285 }
1286#endif
1287 }
1288
1289 auto r = numeric_array{};
1290 for (std::size_t i = 0; i != N; ++i) {
1291 r[i] = mask[i] != T{0} ? b[i] : a[i];
1292 }
1293 return r;
1294 }
1295
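 // Negate the elements selected by Mask.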
1300 template<std::size_t Mask>
1301 [[nodiscard]] friend constexpr numeric_array neg(numeric_array rhs) noexcept
1302 {
1303 return blend<Mask>(rhs, -rhs);
1304 }
1305
1306 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const& rhs) noexcept
1307 {
1308 return T{0} - rhs;
1309 }
1310
1311 [[nodiscard]] friend constexpr numeric_array abs(numeric_array const& rhs) noexcept
1312 {
1313 if (not std::is_constant_evaluated()) {
1314#if defined(HI_HAS_AVX2)
1315 if constexpr (is_i32x8) {
1316 return numeric_array{_mm256_abs_epi32(rhs.reg())};
1317 } else if constexpr (is_i16x16) {
1318 return numeric_array{_mm256_abs_epi16(rhs.reg())};
1319 } else if constexpr (is_i8x32) {
1320 return numeric_array{_mm256_abs_epi8(rhs.reg())};
1321 }
1322#endif
1323#if defined(HI_HAS_SSSE3)
1324 if constexpr (is_i32x4) {
1325 return numeric_array{_mm_abs_epi32(rhs.reg())};
1326 } else if constexpr (is_i16x8) {
1327 return numeric_array{_mm_abs_epi16(rhs.reg())};
1328 } else if constexpr (is_i8x16) {
1329 return numeric_array{_mm_abs_epi8(rhs.reg())};
1330 }
1331#endif
1332#if defined(HI_HAS_SSE2)
1333 if constexpr (is_f64x2) {
1334 return numeric_array{_mm_castsi128_pd(_mm_srli_epi64(_mm_slli_epi64(_mm_castpd_si128(rhs.reg()), 1), 1))};
1335 } else if constexpr (is_f32x4) {
1336 return numeric_array{_mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(_mm_castps_si128(rhs.reg()), 1), 1))};
1337 }
1338#endif
1339 }
1340
1341 return max(rhs, -rhs);
1342 }
1343
1344 [[nodiscard]] friend constexpr numeric_array rcp(numeric_array const& rhs) noexcept
1345 {
1346 if (not std::is_constant_evaluated()) {
1347#if defined(HI_HAS_AVX)
1348 if constexpr (is_f32x8) {
1349 return numeric_array{_mm256_rcp_ps(rhs.reg())};
1350 }
1351#endif
1352#if defined(HI_HAS_SSE)
1353 if constexpr (is_f32x4) {
1354 return numeric_array{_mm_rcp_ps(rhs.reg())};
1355 }
1356#endif
1357 }
1358
1359 return T{1} / rhs;
1360 }
1361
1362 [[nodiscard]] friend constexpr numeric_array sqrt(numeric_array const& rhs) noexcept
1363 {
1364 if (not std::is_constant_evaluated()) {
1365#if defined(HI_HAS_AVX)
1366 if constexpr (is_f64x4) {
1367 return numeric_array{_mm256_sqrt_pd(rhs.reg())};
1368 } else if constexpr (is_f32x8) {
1369 return numeric_array{_mm256_sqrt_ps(rhs.reg())};
1370 }
1371#endif
1372#if defined(HI_HAS_SSE2)
1373 if constexpr (is_f64x2) {
1374 return numeric_array{_mm_sqrt_pd(rhs.reg())};
1375 }
1376#endif
1377#if defined(HI_HAS_SSE)
1378 if constexpr (is_f32x4) {
1379 return numeric_array{_mm_sqrt_ps(rhs.reg())};
1380 }
1381#endif
1382 }
1383
1384 auto r = numeric_array{};
1385 for (std::size_t i = 0; i != N; ++i) {
1386 r[i] = std::sqrt(rhs.v[i]);
1387 }
1388 return r;
1389 }
1390
1391 [[nodiscard]] friend constexpr numeric_array rcp_sqrt(numeric_array const& rhs) noexcept
1392 {
1393 if (not std::is_constant_evaluated()) {
1394#if defined(HI_HAS_AVX)
1395 if constexpr (is_f32x8) {
1396 return numeric_array{_mm256_rsqrt_ps(rhs.reg())};
1397 }
1398#endif
1399#if defined(HI_HAS_SSE)
1400 if constexpr (is_f32x4) {
1401 return numeric_array{_mm_rsqrt_ps(rhs.reg())};
1402 }
1403#endif
1404 }
1405
1406 return rcp(sqrt(rhs));
1407 }
1408
1409 [[nodiscard]] friend constexpr numeric_array floor(numeric_array const& rhs) noexcept
1410 requires(std::is_floating_point_v<value_type>)
1411 {
1412 if (not std::is_constant_evaluated()) {
1413#if defined(HI_HAS_AVX)
1414 if constexpr (is_f64x4) {
1415 return numeric_array{_mm256_floor_pd(rhs.reg())};
1416 } else if constexpr (is_f32x8) {
1417 return numeric_array{_mm256_floor_ps(rhs.reg())};
1418 }
1419#endif
1420#if defined(HI_HAS_SSE4_1)
1421 if constexpr (is_f64x2) {
1422 return numeric_array{_mm_floor_pd(rhs.reg())};
1423 } else if constexpr (is_f32x4) {
1424 return numeric_array{_mm_floor_ps(rhs.reg())};
1425 }
1426#endif
1427 }
1428
1429 auto r = numeric_array{};
1430 for (std::size_t i = 0; i != N; ++i) {
1431 r[i] = std::floor(rhs.v[i]);
1432 }
1433 return r;
1434 }
1435
1436 [[nodiscard]] friend constexpr numeric_array ceil(numeric_array const& rhs) noexcept
1437 requires(std::is_floating_point_v<value_type>)
1438 {
1439 if (not std::is_constant_evaluated()) {
1440#if defined(HI_HAS_AVX)
1441 if constexpr (is_f64x4) {
1442 return numeric_array{_mm256_ceil_pd(rhs.reg())};
1443 } else if constexpr (is_f32x8) {
1444 return numeric_array{_mm256_ceil_ps(rhs.reg())};
1445 }
1446#endif
1447#if defined(HI_HAS_SSE4_1)
1448 if constexpr (is_f64x2) {
1449 return numeric_array{_mm_ceil_pd(rhs.reg())};
1450 } else if constexpr (is_f32x4) {
1451 return numeric_array{_mm_ceil_ps(rhs.reg())};
1452 }
1453#endif
1454 }
1455
1456 auto r = numeric_array{};
1457 for (std::size_t i = 0; i != N; ++i) {
1458 r[i] = std::ceil(rhs.v[i]);
1459 }
1460 return r;
1461 }
1462
1463 [[nodiscard]] friend constexpr numeric_array round(numeric_array const& rhs) noexcept
1464 requires(std::is_floating_point_v<value_type>)
1465 {
1466 if (not std::is_constant_evaluated()) {
1467#if defined(HI_HAS_AVX)
1468 if constexpr (is_f64x4) {
1469 return numeric_array{_mm256_round_pd(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1470 } else if constexpr (is_f32x8) {
1471 return numeric_array{_mm256_round_ps(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1472 }
1473#endif
1474#if defined(HI_HAS_SSE4_1)
1475 if constexpr (is_f64x2) {
1476 return numeric_array{_mm_round_pd(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1477 } else if constexpr (is_f32x4) {
1478 return numeric_array{_mm_round_ps(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1479 }
1480#endif
1481 }
1482
1483 auto r = numeric_array{};
1484 for (std::size_t i = 0; i != N; ++i) {
1485 r[i] = std::round(rhs.v[i]);
1486 }
1487 return r;
1488 }
1489
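 // Dot product over the elements selected by Mask.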
1497 template<std::size_t Mask>
1498 [[nodiscard]] hi_force_inline friend constexpr T dot(numeric_array const& lhs, numeric_array const& rhs) noexcept
1499 {
1500 if (not std::is_constant_evaluated()) {
1501#if defined(HI_HAS_SSE4_1)
1502 if constexpr (is_f64x2) {
1503 return std::bit_cast<double>(_mm_extract_epi64(_mm_castpd_si128(_mm_dp_pd(lhs.reg(), rhs.reg(), (Mask << 4) | 0xf)), 0));
1504 } else if constexpr (is_f32x4) {
1505 return std::bit_cast<float>(_mm_extract_ps(_mm_dp_ps(lhs.reg(), rhs.reg(), (Mask << 4) | 0xf), 0));
1506 }
1507#endif
1508 }
1509
1510 auto r = T{};
1511 for (std::size_t i = 0; i != N; ++i) {
1512 if (to_bool(Mask & (1_uz << i))) {
1513 r += lhs.v[i] * rhs.v[i];
1514 }
1515 }
1516 return r;
1517 }
1518
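 // Length (square root of the dot product) over the elements selected by Mask.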
1525 template<std::size_t Mask>
1526 [[nodiscard]] friend T hypot(numeric_array const& rhs) noexcept requires (std::is_floating_point_v<value_type>)
1527 {
1528 return std::sqrt(dot<Mask>(rhs, rhs));
1529 }
1530
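 // Squared length over the elements selected by Mask.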
1537 template<std::size_t Mask>
1538 [[nodiscard]] hi_force_inline friend constexpr T squared_hypot(numeric_array const& rhs) noexcept
1539 {
1540 return dot<Mask>(rhs, rhs);
1541 }
1542
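 // Reciprocal of the length over the elements selected by Mask.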
1548 template<std::size_t Mask>
1549 [[nodiscard]] friend constexpr T rcp_hypot(numeric_array const& rhs) noexcept
1550 {
1551 if (not std::is_constant_evaluated()) {
1552#if defined(HI_HAS_SSE4_1)
1553 if constexpr (is_f32x4) {
1554 return std::bit_cast<float>(_mm_extract_ps(_mm_rsqrt_ps(_mm_dp_ps(rhs.reg(), rhs.reg(), (Mask << 4) | 0xf)), 0));
1555 }
1556#endif
1557 }
1558
1559 return 1.0f / hypot<Mask>(rhs);
1560 }
1561
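 // Normalize the elements selected by Mask; the remaining elements are set to zero.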
1569 template<std::size_t Mask>
1570 [[nodiscard]] friend constexpr numeric_array normalize(numeric_array const& rhs) noexcept
1571 {
1572 hi_axiom(rhs.is_vector());
1573
1574 if (not std::is_constant_evaluated()) {
1575#if defined(HI_HAS_SSE4_1)
1576 if constexpr (is_f32x4) {
1577 hilet rhs_ = rhs.reg();
1578 hilet tmp = _mm_mul_ps(_mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, (Mask << 4) | 0xf)), rhs_);
1579 return numeric_array{_mm_insert_ps(tmp, tmp, ~Mask & 0xf)};
1580 }
1581#endif
1582 }
1583
1584 hilet rcp_hypot_ = rcp_hypot<Mask>(rhs);
1585
1586 auto r = numeric_array{};
1587 for (std::size_t i = 0; i != N; ++i) {
1588 if (to_bool(Mask & (1_uz << i))) {
1589 r.v[i] = rhs.v[i] * rcp_hypot_;
1590 }
1591 }
1592 return r;
1593 }
1594
1595 [[nodiscard]] friend constexpr std::size_t eq(numeric_array const& lhs, numeric_array const& rhs) noexcept
1596 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1597 {
1598 if (not std::is_constant_evaluated()) {
1599#if defined(HI_HAS_AVX2)
1600 if constexpr (is_i64x4 or is_u64x4) {
1601 return static_cast<std::size_t>(
1602 _mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(lhs.reg(), rhs.reg()))));
1603 } else if constexpr (is_i32x8 or is_u32x8) {
1604 return static_cast<std::size_t>(
1605 _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(lhs.reg(), rhs.reg()))));
1606 } else if constexpr (is_i8x32 or is_u8x32) {
1607 return static_cast<std::size_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(lhs.reg(), rhs.reg())));
1608 }
1609#endif
1610#if defined(HI_HAS_AVX)
1611 if constexpr (is_f64x4) {
1612 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_EQ_OQ)));
1613 } else if constexpr (is_f32x8) {
1614 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_EQ_OQ)));
1615 }
1616#endif
1617#if defined(HI_HAS_SSE4_1)
1618 if constexpr (is_i64x2 or is_u64x2) {
1619 return static_cast<std::size_t>(_mm_movemask_pd(_mm_castsi128_pd(_mm_cmpeq_epi64(lhs.reg(), rhs.reg()))));
1620 }
1621#endif
1622#if defined(HI_HAS_SSE2)
1623 if constexpr (is_f64x2) {
1624 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpeq_pd(lhs.reg(), rhs.reg())));
1625 } else if constexpr (is_i32x4 or is_u32x4) {
1626 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs.reg(), rhs.reg()))));
1627 } else if constexpr (is_i8x16 or is_u8x16) {
1628 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(lhs.reg(), rhs.reg())));
1629 }
1630#endif
1631#if defined(HI_HAS_SSE)
1632 if constexpr (is_f32x4) {
1633 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpeq_ps(lhs.reg(), rhs.reg())));
1634 }
1635#endif
1636 }
1637
1638 std::size_t r = 0;
1639 for (std::size_t i = 0; i != N; ++i) {
1640 r |= static_cast<std::size_t>(lhs.v[i] == rhs.v[i]) << i;
1641 }
1642 return r;
1643 }
1644
1645 [[nodiscard]] friend constexpr std::size_t ne(numeric_array const& lhs, numeric_array const& rhs) noexcept
1646 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1647 {
1648 if (not std::is_constant_evaluated()) {
1649#if defined(HI_HAS_AVX)
1650 if constexpr (is_f64x4) {
1651 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_NEQ_OQ)));
1652 } else if constexpr (is_f32x8) {
1653 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_NEQ_OQ)));
1654 }
1655#endif
1656#if defined(HI_HAS_SSE2)
1657 if constexpr (is_f64x2) {
1658 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpneq_pd(lhs.reg(), rhs.reg())));
1659 }
1660#endif
1661#if defined(HI_HAS_SSE)
1662 if constexpr (is_f32x4) {
1663 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpneq_ps(lhs.reg(), rhs.reg())));
1664 }
1665#endif
1666 }
1667
1668 constexpr std::size_t not_mask = (1_uz << N) - 1;
1669 return eq(lhs, rhs) ^ not_mask;
1670 }
1671
1672 [[nodiscard]] friend constexpr std::size_t gt(numeric_array const& lhs, numeric_array const& rhs) noexcept
1673 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1674 {
1675 if (not std::is_constant_evaluated()) {
1676#if defined(HI_HAS_AVX2)
1677 if constexpr (is_i64x4) {
1678 return static_cast<std::size_t>(
1679 _mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpgt_epi64(lhs.reg(), rhs.reg()))));
1680 } else if constexpr (is_i32x8) {
1681 return static_cast<std::size_t>(
1682 _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(lhs.reg(), rhs.reg()))));
1683 } else if constexpr (is_i8x32) {
1684 return static_cast<std::size_t>(_mm256_movemask_epi8(_mm256_cmpgt_epi8(lhs.reg(), rhs.reg())));
1685 }
1686#endif
1687#if defined(HI_HAS_AVX)
1688 if constexpr (is_f64x4) {
1689 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_GT_OQ)));
1690 } else if constexpr (is_f32x8) {
1691 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_GT_OQ)));
1692 }
1693#endif
1694#if defined(HI_HAS_SSE4_1)
1695 if constexpr (is_i64x2) {
1696 return static_cast<std::size_t>(_mm_movemask_pd(_mm_castsi128_pd(_mm_cmpgt_epi64(lhs.reg(), rhs.reg()))));
1697 }
1698#endif
1699#if defined(HI_HAS_SSE2)
1700 if constexpr (is_f64x2) {
1701 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpgt_pd(lhs.reg(), rhs.reg())));
1702 } else if constexpr (is_i32x4) {
1703 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(lhs.reg(), rhs.reg()))));
1704 } else if constexpr (is_i8x16) {
1705 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(lhs.reg(), rhs.reg())));
1706 }
1707#endif
1708#if defined(HI_HAS_SSE)
1709 if constexpr (is_f32x4) {
1710 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpgt_ps(lhs.reg(), rhs.reg())));
1711 }
1712#endif
1713 }
1714
1715 std::size_t r = 0;
1716 for (std::size_t i = 0; i != N; ++i) {
1717 r |= static_cast<std::size_t>(lhs.v[i] > rhs.v[i]) << i;
1718 }
1719 return r;
1720 }
1721
1722 [[nodiscard]] friend constexpr std::size_t lt(numeric_array const& lhs, numeric_array const& rhs) noexcept
1723 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1724 {
1725 if (not std::is_constant_evaluated()) {
1726#if defined(HI_HAS_AVX)
1727 if constexpr (is_f64x4) {
1728 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_LT_OQ)));
1729 } else if constexpr (is_f32x8) {
1730 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_LT_OQ)));
1731 }
1732#endif
1733#if defined(HI_HAS_SSE2)
1734 if constexpr (is_f64x2) {
1735 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmplt_pd(lhs.reg(), rhs.reg())));
1736 } else if constexpr (is_i32x4) {
1737 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(lhs.reg(), rhs.reg()))));
1738 } else if constexpr (is_i8x16) {
1739 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmplt_epi8(lhs.reg(), rhs.reg())));
1740 }
1741#endif
1742#if defined(HI_HAS_SSE)
1743 if constexpr (is_f32x4) {
1744 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmplt_ps(lhs.reg(), rhs.reg())));
1745 }
1746#endif
1747 }
1748
1749 // gt() and eq() have the best x64 support.
1750 return gt(rhs, lhs);
1751 }
1752
1753 [[nodiscard]] friend constexpr std::size_t ge(numeric_array const& lhs, numeric_array const& rhs) noexcept
1754 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1755 {
1756 if (not std::is_constant_evaluated()) {
1757#if defined(HI_HAS_AVX)
1758 if constexpr (is_f64x4) {
1759 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_GE_OQ)));
1760 } else if constexpr (is_f32x8) {
1761 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_GE_OQ)));
1762 }
1763#endif
1764#if defined(HI_HAS_SSE2)
1765 if constexpr (is_f64x2) {
1766 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpge_pd(lhs.reg(), rhs.reg())));
1767 }
1768#endif
1769#if defined(HI_HAS_SSE)
1770 if constexpr (is_f32x4) {
1771 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpge_ps(lhs.reg(), rhs.reg())));
1772 }
1773#endif
1774 }
1775
1776 // gt() and eq() have the best x64 support.
1777 return gt(lhs, rhs) | eq(lhs, rhs);
1778 }
1779
1780 [[nodiscard]] friend constexpr std::size_t le(numeric_array const& lhs, numeric_array const& rhs) noexcept
1781 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1782 {
1783 if (not std::is_constant_evaluated()) {
1784#if defined(HI_HAS_AVX)
1785 if constexpr (is_f64x4) {
1786 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_LE_OQ)));
1787 } else if constexpr (is_f32x8) {
1788 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_LE_OQ)));
1789 }
1790#endif
1791#if defined(HI_HAS_SSE2)
1792 if constexpr (is_f64x2) {
1793 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmple_pd(lhs.reg(), rhs.reg())));
1794 }
1795#endif
1796#if defined(HI_HAS_SSE)
1797 if constexpr (is_f32x4) {
1798 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmple_ps(lhs.reg(), rhs.reg())));
1799 }
1800#endif
1801 }
1802
1803 // gt() and eq() have the best x64 support.
1804 return gt(rhs, lhs) | eq(rhs, lhs);
1805 }
1806
1807 [[nodiscard]] friend constexpr numeric_array gt_mask(numeric_array const& lhs, numeric_array const& rhs) noexcept
1808 {
1809 if (not std::is_constant_evaluated()) {
1810#if defined(HI_HAS_SSE4_2)
1811 if constexpr (is_i64x2) {
1812 return numeric_array{_mm_cmpgt_epi64(lhs.reg(), rhs.reg())};
1813 }
1814#endif
1815#if defined(HI_HAS_SSE2)
1816 if constexpr (is_i32x4) {
1817 return numeric_array{_mm_cmpgt_epi32(lhs.reg(), rhs.reg())};
1818 } else if constexpr (is_i16x8) {
1819 return numeric_array{_mm_cmpgt_epi16(lhs.reg(), rhs.reg())};
1820 } else if constexpr (is_i8x16) {
1821 return numeric_array{_mm_cmpgt_epi8(lhs.reg(), rhs.reg())};
1822 }
1823#endif
1824#if defined(HI_HAS_SSE)
1825 if constexpr (is_f32x4) {
1826 return numeric_array{_mm_cmpgt_ps(lhs.reg(), rhs.reg())};
1827 }
1828#endif
1829 }
1830
1831 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
1832 constexpr auto ones = std::bit_cast<T>(~uint_type{0});
1833
1834 auto r = numeric_array{};
1835 for (std::size_t i = 0; i != N; ++i) {
1836 r[i] = lhs.v[i] > rhs.v[i] ? ones : T{0};
1837 }
1838 return r;
1839 }
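// Illustrative note (example values assumed): unlike gt(), which packs the comparison into a bit
// mask, gt_mask() returns a per-element mask where a true lane has every bit set (the ~0 pattern
// bit_cast to T) and a false lane is T{0}, so the result can be combined with other arrays via the
// bitwise operators below. For i32x4 a = {1, 5, 3, 3} and b = {2, 4, 3, 1}, gt_mask(a, b) == {0, -1, 0, -1}.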
1840
1841 [[nodiscard]] friend constexpr bool operator==(numeric_array const& lhs, numeric_array const& rhs) noexcept
1842 {
1843 return not ne(lhs, rhs);
1844 }
1845
1846 [[nodiscard]] friend constexpr numeric_array operator<<(numeric_array const& lhs, unsigned int rhs) noexcept
1847 {
1848 if (not std::is_constant_evaluated()) {
1849#if defined(HI_HAS_AVX2)
1850 if constexpr (is_f64x4) {
1851 return numeric_array{_mm256_castsi256_pd(_mm256_slli_epi64(_mm256_castpd_si256(lhs.reg()), rhs))};
1852 } else if constexpr (is_f32x8) {
1853 return numeric_array{_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(lhs.reg()), rhs))};
1854 } else if constexpr (is_i64x4 or is_u64x4) {
1855 return numeric_array{_mm256_slli_epi64(lhs.reg(), rhs)};
1856 } else if constexpr (is_i32x8 or is_u32x8) {
1857 return numeric_array{_mm256_slli_epi32(lhs.reg(), rhs)};
1858 } else if constexpr (is_i16x16 or is_u16x16) {
1859 return numeric_array{_mm256_slli_epi16(lhs.reg(), rhs)};
1860 }
1861#endif
1862#if defined(HI_HAS_SSE2)
1863 if constexpr (is_f64x2) {
1864 return numeric_array{_mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(lhs.reg()), rhs))};
1865 } else if constexpr (is_f32x4) {
1866 return numeric_array{_mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(lhs.reg()), rhs))};
1867 } else if constexpr (is_i64x2 or is_u64x2) {
1868 return numeric_array{_mm_slli_epi64(lhs.reg(), rhs)};
1869 } else if constexpr (is_i32x4 or is_u32x4) {
1870 return numeric_array{_mm_slli_epi32(lhs.reg(), rhs)};
1871 } else if constexpr (is_i16x8 or is_u16x8) {
1872 return numeric_array{_mm_slli_epi16(lhs.reg(), rhs)};
1873 }
1874#endif
1875 }
1876
1877 auto r = numeric_array{};
1878 for (std::size_t i = 0; i != N; ++i) {
1879 r.v[i] = lhs.v[i] << rhs;
1880 }
1881 return r;
1882 }
1883
1884 [[nodiscard]] friend constexpr numeric_array operator>>(numeric_array const& lhs, unsigned int rhs) noexcept
1885 {
1886 if (not std::is_constant_evaluated()) {
1887#if defined(HI_HAS_AVX2)
1888 if constexpr (is_f64x4) {
1889 return numeric_array{_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_castpd_si256(lhs.reg()), rhs))};
1890 } else if constexpr (is_f32x8) {
1891 return numeric_array{_mm256_castsi256_ps(_mm256_srli_epi32(_mm256_castps_si256(lhs.reg()), rhs))};
1892 } else if constexpr (is_u64x4) {
1893 return numeric_array{_mm256_srli_epi64(lhs.reg(), rhs)};
1894 } else if constexpr (is_i32x8) {
1895 return numeric_array{_mm256_srai_epi32(lhs.reg(), rhs)};
1896 } else if constexpr (is_u32x8) {
1897 return numeric_array{_mm256_srli_epi32(lhs.reg(), rhs)};
1898 } else if constexpr (is_i16x16) {
1899 return numeric_array{_mm256_srai_epi16(lhs.reg(), rhs)};
1900 } else if constexpr (is_u16x16) {
1901 return numeric_array{_mm256_srli_epi16(lhs.reg(), rhs)};
1902 }
1903#endif
1904#if defined(HI_HAS_SSE2)
1905 if constexpr (is_f64x2) {
1906 return numeric_array{_mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(lhs.reg()), rhs))};
1907 } else if constexpr (is_f32x4) {
1908 return numeric_array{_mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(lhs.reg()), rhs))};
1909 } else if constexpr (is_u64x2) {
1910 return numeric_array{_mm_srli_epi64(lhs.reg(), rhs)};
1911 } else if constexpr (is_i32x4) {
1912 return numeric_array{_mm_srai_epi32(lhs.reg(), rhs)};
1913 } else if constexpr (is_u32x4) {
1914 return numeric_array{_mm_srli_epi32(lhs.reg(), rhs)};
1915 } else if constexpr (is_i16x8) {
1916 return numeric_array{_mm_srai_epi16(lhs.reg(), rhs)};
1917 } else if constexpr (is_u16x8) {
1918 return numeric_array{_mm_srli_epi16(lhs.reg(), rhs)};
1919 }
1920#endif
1921 }
1922
1923 auto r = numeric_array{};
1924 for (std::size_t i = 0; i != N; ++i) {
1925 r.v[i] = lhs.v[i] >> rhs;
1926 }
1927 return r;
1928 }
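// Illustrative note (example values assumed): operator>> is an arithmetic shift (srai,
// sign-extending) for signed element types and a logical shift (srli, zero-filling) for unsigned
// ones, matching >> on the scalar element type; the float specialisations shift the raw bit pattern.
// For example, i32x4{-8, 8, -1, 1} >> 1 == i32x4{-4, 4, -1, 0}, while u32x4{8, 8, 1, 1} >> 1 == u32x4{4, 4, 0, 0}.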
1929
1934 [[nodiscard]] friend constexpr numeric_array rotl(numeric_array const& lhs, unsigned int rhs) noexcept
1935 {
1936 hi_axiom(rhs > 0 and rhs < sizeof(value_type) * CHAR_BIT);
1937
1938 hilet remainder = narrow_cast<unsigned int>(sizeof(value_type) * CHAR_BIT - rhs);
1939
1940 return (lhs << rhs) | (lhs >> remainder);
1941 }
1942
1947 [[nodiscard]] friend constexpr numeric_array rotr(numeric_array const& lhs, unsigned int rhs) noexcept
1948 {
1949 hi_axiom(rhs > 0 and rhs < sizeof(value_type) * CHAR_BIT);
1950
1951 hilet remainder = narrow_cast<unsigned int>(sizeof(value_type) * CHAR_BIT - rhs);
1952
1953 return (lhs >> rhs) | (lhs << remainder);
1954 }
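// Illustrative note (example values assumed): the rotates are built from two element-wise shifts;
// a left rotate by r bits of a w-bit element is (x << r) | (x >> (w - r)). The hi_axiom() excludes
// r == 0 and r == w because a shift over the full element width is undefined. For example,
// rotl() of a u8x16 filled with 0b1000'0001 by 1 yields elements of 0b0000'0011.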
1955
1956 [[nodiscard]] friend constexpr numeric_array operator|(numeric_array const& lhs, numeric_array const& rhs) noexcept
1957 {
1958 if (not std::is_constant_evaluated()) {
1959#if defined(HI_HAS_AVX2)
1960 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
1961 return numeric_array{_mm256_or_si256(lhs.reg(), rhs.reg())};
1962 }
1963#endif
1964#if defined(HI_HAS_AVX)
1965 if constexpr (is_f64x4) {
1966 return numeric_array{_mm256_or_pd(lhs.reg(), rhs.reg())};
1967 } else if constexpr (is_f32x8) {
1968 return numeric_array{_mm256_or_ps(lhs.reg(), rhs.reg())};
1969 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
1970 return numeric_array{
1971 _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
1972 }
1973#endif
1974#if defined(HI_HAS_SSE2)
1975 if constexpr (is_f64x2) {
1976 return numeric_array{_mm_or_pd(lhs.reg(), rhs.reg())};
1977 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
1978 return numeric_array{_mm_or_si128(lhs.reg(), rhs.reg())};
1979 }
1980#endif
1981#if defined(HI_HAS_SSE)
1982 if constexpr (is_f64x2) {
1983 return numeric_array{_mm_castps_pd(_mm_or_ps(_mm_castpd_ps(lhs.reg()), _mm_castpd_ps(rhs.reg())))};
1984
1985 } else if constexpr (is_f32x4) {
1986 return numeric_array{_mm_or_ps(lhs.reg(), rhs.reg())};
1987
1988 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
1989 return numeric_array{_mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
1990 }
1991#endif
1992 }
1993
1994 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
1995
1996 auto r = numeric_array{};
1997 for (std::size_t i = 0; i != N; ++i) {
1998 r.v[i] =
1999 std::bit_cast<T>(static_cast<uint_type>(std::bit_cast<uint_type>(lhs.v[i]) | std::bit_cast<uint_type>(rhs.v[i])));
2000 }
2001 return r;
2002 }
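// Note: the scalar fallback above routes each element through an unsigned integer of the same
// width (make_uintxx_t) because the built-in bitwise operators are not defined for floating-point
// element types; std::bit_cast keeps the bit pattern intact, so the result matches what the
// register path produces for float and double arrays.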
2003
2004 [[nodiscard]] friend constexpr numeric_array operator|(numeric_array const& lhs, T const& rhs) noexcept
2005 {
2006 return lhs | broadcast(rhs);
2007 }
2008
2009 [[nodiscard]] friend constexpr numeric_array operator|(T const& lhs, numeric_array const& rhs) noexcept
2010 {
2011 return broadcast(lhs) | rhs;
2012 }
2013
2014 [[nodiscard]] friend constexpr numeric_array operator&(numeric_array const& lhs, numeric_array const& rhs) noexcept
2015 {
2016 if (not std::is_constant_evaluated()) {
2017#if defined(HI_HAS_AVX2)
2018 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
2019 return numeric_array{_mm256_and_si256(lhs.reg(), rhs.reg())};
2020 }
2021#endif
2022#if defined(HI_HAS_AVX)
2023 if constexpr (is_f64x4) {
2024 return numeric_array{_mm256_and_pd(lhs.reg(), rhs.reg())};
2025 } else if constexpr (is_f32x8) {
2026 return numeric_array{_mm256_and_ps(lhs.reg(), rhs.reg())};
2027 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
2028 return numeric_array{
2029 _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
2030 }
2031#endif
2032#if defined(HI_HAS_SSE2)
2033 if constexpr (is_f64x2) {
2034 return numeric_array{_mm_and_pd(lhs.reg(), rhs.reg())};
2035 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
2036 return numeric_array{_mm_and_si128(lhs.reg(), rhs.reg())};
2037 }
2038#endif
2039#if defined(HI_HAS_SSE)
2040 if constexpr (is_f64x2) {
2041 return numeric_array{_mm_castps_pd(_mm_and_ps(_mm_castpd_ps(lhs.reg()), _mm_castpd_ps(rhs.reg())))};
2042
2043 } else if constexpr (is_f32x4) {
2044 return numeric_array{_mm_and_ps(lhs.reg(), rhs.reg())};
2045
2046 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
2047 return numeric_array{_mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
2048 }
2049#endif
2050 }
2051 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
2052 auto r = numeric_array{};
2053 for (std::size_t i = 0; i != N; ++i) {
2054 r.v[i] = std::bit_cast<T>(static_cast<uint_type>(std::bit_cast<uint_type>(lhs.v[i]) & std::bit_cast<uint_type>(rhs.v[i])));
2055 }
2056 return r;
2057 }
2058
2059 [[nodiscard]] friend constexpr numeric_array operator&(numeric_array const& lhs, T const& rhs) noexcept
2060 {
2061 return lhs & broadcast(rhs);
2062 }
2063
2064 [[nodiscard]] friend constexpr numeric_array operator&(T const& lhs, numeric_array const& rhs) noexcept
2065 {
2066 return broadcast(lhs) & rhs;
2067 }
2068
2069 [[nodiscard]] friend constexpr numeric_array operator^(numeric_array const& lhs, numeric_array const& rhs) noexcept
2070 {
2071 if (not std::is_constant_evaluated()) {
2072#if defined(HI_HAS_AVX2)
2073 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
2074 return numeric_array{_mm256_xor_si256(lhs.reg(), rhs.reg())};
2075 }
2076#endif
2077#if defined(HI_HAS_AVX)
2078 if constexpr (is_f64x4) {
2079 return numeric_array{_mm256_xor_pd(lhs.reg(), rhs.reg())};
2080 } else if constexpr (is_f32x8) {
2081 return numeric_array{_mm256_xor_ps(lhs.reg(), rhs.reg())};
2082 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
2083 return numeric_array{
2084 _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
2085 }
2086#endif
2087#if defined(HI_HAS_SSE2)
2088 if constexpr (is_f64x2) {
2089 return numeric_array{_mm_xor_pd(lhs.reg(), rhs.reg())};
2090 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
2091 return numeric_array{_mm_xor_si128(lhs.reg(), rhs.reg())};
2092 }
2093#endif
2094#if defined(HI_HAS_SSE)
2095 if constexpr (is_f64x2) {
2096 return numeric_array{_mm_castps_pd(_mm_xor_ps(_mm_castpd_ps(lhs.reg()), _mm_castpd_ps(rhs.reg())))};
2097
2098 } else if constexpr (is_f32x4) {
2099 return numeric_array{_mm_xor_ps(lhs.reg(), rhs.reg())};
2100
2101 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
2102 return numeric_array{_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
2103 }
2104#endif
2105 }
2106 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
2107 auto r = numeric_array{};
2108 for (std::size_t i = 0; i != N; ++i) {
2109 r.v[i] = std::bit_cast<T>(static_cast<uint_type>(std::bit_cast<uint_type>(lhs.v[i]) ^ std::bit_cast<uint_type>(rhs.v[i])));
2110 }
2111 return r;
2112 }
2113
2114 [[nodiscard]] friend constexpr numeric_array operator^(numeric_array const& lhs, T const& rhs) noexcept
2115 {
2116 return lhs ^ broadcast(rhs);
2117 }
2118
2119 [[nodiscard]] friend constexpr numeric_array operator^(T const& lhs, numeric_array const& rhs) noexcept
2120 {
2121 return broadcast(lhs) ^ rhs;
2122 }
2123
2124 [[nodiscard]] friend constexpr numeric_array operator+(numeric_array const& lhs, numeric_array const& rhs) noexcept
2125 {
2126 if (not std::is_constant_evaluated()) {
2127#if defined(HI_HAS_AVX2)
2128 if constexpr (is_i64x4 or is_u64x4) {
2129 return numeric_array{_mm256_add_epi64(lhs.reg(), rhs.reg())};
2130 } else if constexpr (is_i32x8 or is_u32x8) {
2131 return numeric_array{_mm256_add_epi32(lhs.reg(), rhs.reg())};
2132 } else if constexpr (is_i16x16 or is_u16x16) {
2133 return numeric_array{_mm256_add_epi16(lhs.reg(), rhs.reg())};
2134 } else if constexpr (is_i8x32 or is_u8x32) {
2135 return numeric_array{_mm256_add_epi8(lhs.reg(), rhs.reg())};
2136 }
2137#endif
2138#if defined(HI_HAS_AVX)
2139 if constexpr (is_f64x4) {
2140 return numeric_array{_mm256_add_pd(lhs.reg(), rhs.reg())};
2141 } else if constexpr (is_f32x8) {
2142 return numeric_array{_mm256_add_ps(lhs.reg(), rhs.reg())};
2143 }
2144#endif
2145#if defined(HI_HAS_SSE2)
2146 if constexpr (is_f64x2) {
2147 return numeric_array{_mm_add_pd(lhs.reg(), rhs.reg())};
2148 } else if constexpr (is_i64x2 or is_u64x2) {
2149 return numeric_array{_mm_add_epi64(lhs.reg(), rhs.reg())};
2150 } else if constexpr (is_i32x4 or is_u32x4) {
2151 return numeric_array{_mm_add_epi32(lhs.reg(), rhs.reg())};
2152 } else if constexpr (is_i16x8 or is_u16x8) {
2153 return numeric_array{_mm_add_epi16(lhs.reg(), rhs.reg())};
2154 } else if constexpr (is_i8x16 or is_u8x16) {
2155 return numeric_array{_mm_add_epi8(lhs.reg(), rhs.reg())};
2156 }
2157#endif
2158#if defined(HI_HAS_SSE)
2159 if constexpr (is_f32x4) {
2160 return numeric_array{_mm_add_ps(lhs.reg(), rhs.reg())};
2161 }
2162#endif
2163 }
2164
2165 auto r = numeric_array{};
2166 for (std::size_t i = 0; i != N; ++i) {
2167 r.v[i] = lhs.v[i] + rhs.v[i];
2168 }
2169 return r;
2170 }
2171
2172 [[nodiscard]] friend constexpr numeric_array operator+(numeric_array const& lhs, T const& rhs) noexcept
2173 {
2174 return lhs + broadcast(rhs);
2175 }
2176
2177 [[nodiscard]] friend constexpr numeric_array operator+(T const& lhs, numeric_array const& rhs) noexcept
2178 {
2179 return broadcast(lhs) + rhs;
2180 }
2181
2182 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const& lhs, numeric_array const& rhs) noexcept
2183 {
2184 if (not std::is_constant_evaluated()) {
2185#if defined(HI_HAS_AVX2)
2186 if constexpr (is_i64x4 or is_u64x4) {
2187 return numeric_array{_mm256_sub_epi64(lhs.reg(), rhs.reg())};
2188 } else if constexpr (is_i32x8 or is_u32x8) {
2189 return numeric_array{_mm256_sub_epi32(lhs.reg(), rhs.reg())};
2190 } else if constexpr (is_i16x16 or is_u16x16) {
2191 return numeric_array{_mm256_sub_epi16(lhs.reg(), rhs.reg())};
2192 } else if constexpr (is_i8x32 or is_u8x32) {
2193 return numeric_array{_mm256_sub_epi8(lhs.reg(), rhs.reg())};
2194 }
2195#endif
2196#if defined(HI_HAS_AVX)
2197 if constexpr (is_f64x4) {
2198 return numeric_array{_mm256_sub_pd(lhs.reg(), rhs.reg())};
2199 } else if constexpr (is_f32x8) {
2200 return numeric_array{_mm256_sub_ps(lhs.reg(), rhs.reg())};
2201 }
2202#endif
2203#if defined(HI_HAS_SSE2)
2204 if constexpr (is_f64x2) {
2205 return numeric_array{_mm_sub_pd(lhs.reg(), rhs.reg())};
2206 } else if constexpr (is_i64x2 or is_u64x2) {
2207 return numeric_array{_mm_sub_epi64(lhs.reg(), rhs.reg())};
2208 } else if constexpr (is_i32x4 or is_u32x4) {
2209 return numeric_array{_mm_sub_epi32(lhs.reg(), rhs.reg())};
2210 } else if constexpr (is_i16x8 or is_u16x8) {
2211 return numeric_array{_mm_sub_epi16(lhs.reg(), rhs.reg())};
2212 } else if constexpr (is_i8x16 or is_u8x16) {
2213 return numeric_array{_mm_sub_epi8(lhs.reg(), rhs.reg())};
2214 }
2215#endif
2216#if defined(HI_HAS_SSE)
2217 if constexpr (is_f32x4) {
2218 return numeric_array{_mm_sub_ps(lhs.reg(), rhs.reg())};
2219 }
2220#endif
2221 }
2222
2223 auto r = numeric_array{};
2224 for (std::size_t i = 0; i != N; ++i) {
2225 r.v[i] = lhs.v[i] - rhs.v[i];
2226 }
2227 return r;
2228 }
2229
2230 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const& lhs, T const& rhs) noexcept
2231 {
2232 return lhs - broadcast(rhs);
2233 }
2234
2235 [[nodiscard]] friend constexpr numeric_array operator-(T const& lhs, numeric_array const& rhs) noexcept
2236 {
2237 return broadcast(lhs) - rhs;
2238 }
2239
2240 [[nodiscard]] friend constexpr numeric_array operator*(numeric_array const& lhs, numeric_array const& rhs) noexcept
2241 {
2242 if (not std::is_constant_evaluated()) {
2243#if defined(HI_HAS_AVX2)
2244 if constexpr (is_i32x8) {
2245 return numeric_array{_mm256_mullo_epi32(lhs.reg(), rhs.reg())};
2246 } else if constexpr (is_u32x8) {
2247 return numeric_array{_mm256_mullo_epi32(lhs.reg(), rhs.reg())}; // low 32 bits of the product are identical for unsigned
2248 }
2249#endif
2250#if defined(HI_HAS_AVX)
2251 if constexpr (is_f64x4) {
2252 return numeric_array{_mm256_mul_pd(lhs.reg(), rhs.reg())};
2253 } else if constexpr (is_f32x8) {
2254 return numeric_array{_mm256_mul_ps(lhs.reg(), rhs.reg())};
2255 }
2256#endif
2257#if defined(HI_HAS_SSE4_1)
2258 if constexpr (is_i32x4) {
2259 return numeric_array{_mm_mullo_epi32(lhs.reg(), rhs.reg())};
2260 } else if constexpr (is_f16x4) {
2261 return numeric_array{numeric_array<float, 4>{lhs} * numeric_array<float, 4>{rhs}};
2262 }
2263#endif
2264#if defined(HI_HAS_SSE2)
2265 if constexpr (is_f64x2) {
2266 return numeric_array{_mm_mul_pd(lhs.reg(), rhs.reg())};
2267 }
2268#endif
2269#if defined(HI_HAS_SSE)
2270 if constexpr (is_f32x4) {
2271 return numeric_array{_mm_mul_ps(lhs.reg(), rhs.reg())};
2272 }
2273#endif
2274 }
2275
2276 auto r = numeric_array{};
2277 for (std::size_t i = 0; i != N; ++i) {
2278 r.v[i] = lhs.v[i] * rhs.v[i];
2279 }
2280 return r;
2281 }
2282
2283 [[nodiscard]] friend constexpr numeric_array operator*(numeric_array const& lhs, T const& rhs) noexcept
2284 {
2285 return lhs * broadcast(rhs);
2286 }
2287
2288 [[nodiscard]] friend constexpr numeric_array operator*(T const& lhs, numeric_array const& rhs) noexcept
2289 {
2290 return broadcast(lhs) * rhs;
2291 }
2292
2293 [[nodiscard]] friend constexpr numeric_array operator/(numeric_array const& lhs, numeric_array const& rhs) noexcept
2294 {
2295 if (not std::is_constant_evaluated()) {
2296#if defined(HI_HAS_AVX)
2297 if constexpr (is_f64x4) {
2298 return numeric_array{_mm256_div_pd(lhs.reg(), rhs.reg())};
2299 } else if constexpr (is_f32x8) {
2300 return numeric_array{_mm256_div_ps(lhs.reg(), rhs.reg())};
2301 }
2302#endif
2303#if defined(HI_HAS_SSE2)
2304 if constexpr (is_f64x2) {
2305 return numeric_array{_mm_div_pd(lhs.reg(), rhs.reg())};
2306 }
2307#endif
2308#if defined(HI_HAS_SSE)
2309 if constexpr (is_f32x4) {
2310 return numeric_array{_mm_div_ps(lhs.reg(), rhs.reg())};
2311 } else if constexpr (is_i32x4) {
2312 return numeric_array{_mm_div_epi32(lhs.reg(), rhs.reg())};
2313 }
2314#endif
2315 }
2316
2317 auto r = numeric_array{};
2318 for (std::size_t i = 0; i != N; ++i) {
2319 r.v[i] = lhs.v[i] / rhs.v[i];
2320 }
2321 return r;
2322 }
2323
2324 [[nodiscard]] friend constexpr numeric_array operator/(numeric_array const& lhs, T const& rhs) noexcept
2325 {
2326 return lhs / broadcast(rhs);
2327 }
2328
2329 [[nodiscard]] friend constexpr numeric_array operator/(T const& lhs, numeric_array const& rhs) noexcept
2330 {
2331 return broadcast(lhs) / rhs;
2332 }
2333
2334 [[nodiscard]] friend constexpr numeric_array operator%(numeric_array const& lhs, numeric_array const& rhs) noexcept
2335 {
2336 hilet div_result = floor(lhs / rhs);
2337 return lhs - (div_result * rhs);
2338 }
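// Illustrative note (example values assumed): operator% is the floored modulo
// lhs - floor(lhs / rhs) * rhs, so the result takes the sign of the divisor, unlike std::fmod()
// or the built-in % which truncate toward zero. For f32x4, {5, -1, 7, -7} % 4 == {1, 3, 3, 1}.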
2339
2340 [[nodiscard]] friend constexpr numeric_array operator%(numeric_array const& lhs, T const& rhs) noexcept
2341 {
2342 return lhs % broadcast(rhs);
2343 }
2344
2345 [[nodiscard]] friend constexpr numeric_array operator%(T const& lhs, numeric_array const& rhs) noexcept
2346 {
2347 return broadcast(lhs) % rhs;
2348 }
2349
2350 [[nodiscard]] friend constexpr numeric_array min(numeric_array const& lhs, numeric_array const& rhs) noexcept
2351 {
2352 if (not std::is_constant_evaluated()) {
2353#if defined(HI_HAS_AVX2)
2354 if constexpr (is_i32x8) {
2355 return numeric_array{_mm256_min_epi32(lhs.reg(), rhs.reg())};
2356 } else if constexpr (is_u32x8) {
2357 return numeric_array{_mm256_min_epu32(lhs.reg(), rhs.reg())};
2358 } else if constexpr (is_i16x16) {
2359 return numeric_array{_mm256_min_epi16(lhs.reg(), rhs.reg())};
2360 } else if constexpr (is_u16x16) {
2361 return numeric_array{_mm256_min_epu16(lhs.reg(), rhs.reg())};
2362 } else if constexpr (is_i8x32) {
2363 return numeric_array{_mm256_min_epi8(lhs.reg(), rhs.reg())};
2364 } else if constexpr (is_u8x32) {
2365 return numeric_array{_mm256_min_epu8(lhs.reg(), rhs.reg())};
2366 }
2367#endif
2368#if defined(HI_HAS_AVX)
2369 if constexpr (is_f64x4) {
2370 return numeric_array{_mm256_min_pd(lhs.reg(), rhs.reg())};
2371 } else if constexpr (is_f32x8) {
2372 return numeric_array{_mm256_min_ps(lhs.reg(), rhs.reg())};
2373 }
2374#endif
2375#if defined(HI_HAS_SSE4_1)
2376 if constexpr (is_i32x4) {
2377 return numeric_array{_mm_min_epi32(lhs.reg(), rhs.reg())};
2378 } else if constexpr (is_u32x4) {
2379 return numeric_array{_mm_min_epu32(lhs.reg(), rhs.reg())};
2380 } else if constexpr (is_u16x8) {
2381 return numeric_array{_mm_min_epu16(lhs.reg(), rhs.reg())};
2382 } else if constexpr (is_i8x16) {
2383 return numeric_array{_mm_min_epi8(lhs.reg(), rhs.reg())};
2384 }
2385#endif
2386#if defined(HI_HAS_SSE2)
2387 if constexpr (is_f64x2) {
2388 return numeric_array{_mm_min_pd(lhs.reg(), rhs.reg())};
2389 } else if constexpr (is_i16x8) {
2390 return numeric_array{_mm_min_epi16(lhs.reg(), rhs.reg())};
2391 } else if constexpr (is_u8x16) {
2392 return numeric_array{_mm_min_epu8(lhs.reg(), rhs.reg())};
2393 }
2394#endif
2395#if defined(HI_HAS_SSE)
2396 if constexpr (is_f32x4) {
2397 return numeric_array{_mm_min_ps(lhs.reg(), rhs.reg())};
2398 }
2399#endif
2400 }
2401
2402 auto r = numeric_array{};
2403 for (std::size_t i = 0; i != N; ++i) {
2404 r.v[i] = std::min(lhs.v[i], rhs.v[i]);
2405 }
2406 return r;
2407 }
2408
2409 [[nodiscard]] friend constexpr numeric_array max(numeric_array const& lhs, numeric_array const& rhs) noexcept
2410 {
2411 if (not std::is_constant_evaluated()) {
2412#if defined(HI_HAS_AVX2)
2413 if constexpr (is_i32x8) {
2414 return numeric_array{_mm256_max_epi32(lhs.reg(), rhs.reg())};
2415 } else if constexpr (is_u32x8) {
2416 return numeric_array{_mm256_max_epu32(lhs.reg(), rhs.reg())};
2417 } else if constexpr (is_i16x16) {
2418 return numeric_array{_mm256_max_epi16(lhs.reg(), rhs.reg())};
2419 } else if constexpr (is_u16x16) {
2420 return numeric_array{_mm256_max_epu16(lhs.reg(), rhs.reg())};
2421 } else if constexpr (is_i8x32) {
2422 return numeric_array{_mm256_max_epi8(lhs.reg(), rhs.reg())};
2423 } else if constexpr (is_u8x32) {
2424 return numeric_array{_mm256_max_epu8(lhs.reg(), rhs.reg())};
2425 }
2426#endif
2427#if defined(HI_HAS_AVX)
2428 if constexpr (is_f64x4) {
2429 return numeric_array{_mm256_max_pd(lhs.reg(), rhs.reg())};
2430 } else if constexpr (is_f32x8) {
2431 return numeric_array{_mm256_max_ps(lhs.reg(), rhs.reg())};
2432 }
2433#endif
2434#if defined(HI_HAS_SSE4_1)
2435 if constexpr (is_i32x4) {
2436 return numeric_array{_mm_max_epi32(lhs.reg(), rhs.reg())};
2437 } else if constexpr (is_u32x4) {
2438 return numeric_array{_mm_max_epu32(lhs.reg(), rhs.reg())};
2439 } else if constexpr (is_u16x8) {
2440 return numeric_array{_mm_max_epu16(lhs.reg(), rhs.reg())};
2441 } else if constexpr (is_i8x16) {
2442 return numeric_array{_mm_max_epi8(lhs.reg(), rhs.reg())};
2443 }
2444#endif
2445#if defined(HI_HAS_SSE2)
2446 if constexpr (is_f64x2) {
2447 return numeric_array{_mm_max_pd(lhs.reg(), rhs.reg())};
2448 } else if constexpr (is_i16x8) {
2449 return numeric_array{_mm_max_epi16(lhs.reg(), rhs.reg())};
2450 } else if constexpr (is_u8x16) {
2451 return numeric_array{_mm_max_epu8(lhs.reg(), rhs.reg())};
2452 }
2453#endif
2454#if defined(HI_HAS_SSE)
2455 if constexpr (is_f32x4) {
2456 return numeric_array{_mm_max_ps(lhs.reg(), rhs.reg())};
2457 }
2458#endif
2459 }
2460
2461 auto r = numeric_array{};
2462 for (std::size_t i = 0; i != N; ++i) {
2463 r.v[i] = std::max(lhs.v[i], rhs.v[i]);
2464 }
2465 return r;
2466 }
2467
2468 [[nodiscard]] friend constexpr numeric_array
2469 clamp(numeric_array const& lhs, numeric_array const& low, numeric_array const& high) noexcept
2470 {
2471 return min(max(lhs, low), high);
2472 }
2473
2474 [[nodiscard]] friend constexpr numeric_array hadd(numeric_array const& lhs, numeric_array const& rhs) noexcept
2475 {
2476 if (not std::is_constant_evaluated()) {
2477#if defined(HI_HAS_AVX2)
2478 if constexpr (is_i32x8 or is_u32x8) {
2479 return numeric_array{_mm256_hadd_epi32(lhs.reg(), rhs.reg())};
2480 } else if constexpr (is_i16x16 or is_u16x16) {
2481 return numeric_array{_mm256_hadd_epi16(lhs.reg(), rhs.reg())};
2482 }
2483#endif
2484#if defined(HI_HAS_AVX)
2485 if constexpr (is_f64x4) {
2486 return numeric_array{_mm256_hadd_pd(lhs.reg(), rhs.reg())};
2487 } else if constexpr (is_f32x8) {
2488 return numeric_array{_mm256_hadd_ps(lhs.reg(), rhs.reg())};
2489 }
2490#endif
2491#if defined(HI_HAS_SSSE3)
2492 if constexpr (is_i32x4 or is_u32x4) {
2493 return numeric_array{_mm_hadd_epi32(lhs.reg(), rhs.reg())};
2494 } else if constexpr (is_i16x8 or is_u16x8) {
2495 return numeric_array{_mm_hadd_epi16(lhs.reg(), rhs.reg())};
2496 }
2497#endif
2498#if defined(HI_HAS_SSE3)
2499 if constexpr (is_f64x2) {
2500 return numeric_array{_mm_hadd_pd(lhs.reg(), rhs.reg())};
2501 } else if constexpr (is_f32x4) {
2502 return numeric_array{_mm_hadd_ps(lhs.reg(), rhs.reg())};
2503 }
2504#endif
2505 }
2506
2507 hi_axiom(N % 2 == 0);
2508
2509 auto r = numeric_array{};
2510
2511 std::size_t src_i = 0;
2512 std::size_t dst_i = 0;
2513 while (src_i != N) {
2514 auto tmp = lhs[src_i++];
2515 tmp += lhs[src_i++];
2516 r.v[dst_i++] = tmp;
2517 }
2518
2519 src_i = 0;
2520 while (src_i != N) {
2521 auto tmp = rhs[src_i++];
2522 tmp += rhs[src_i++];
2523 r.v[dst_i++] = tmp;
2524 }
2525 return r;
2526 }
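// Illustrative note (example values assumed): hadd() sums adjacent pairs, placing the pairs of
// lhs in the low half of the result and the pairs of rhs in the high half. For f32x4
// a = {1, 2, 3, 4} and b = {10, 20, 30, 40}, hadd(a, b) == {3, 7, 30, 70}. Note that the 256-bit
// AVX horizontal intrinsics work per 128-bit lane, so for 8- and 16-element arrays the element
// order of the register path differs from the scalar loop above.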
2527
2528 [[nodiscard]] friend constexpr numeric_array hsub(numeric_array const& lhs, numeric_array const& rhs) noexcept
2529 {
2530 if (not std::is_constant_evaluated()) {
2531#if defined(HI_HAS_AVX2)
2532 if constexpr (is_i32x8 or is_u32x8) {
2533 return numeric_array{_mm256_hsub_epi32(lhs.reg(), rhs.reg())};
2534 } else if constexpr (is_i16x16 or is_u16x16) {
2535 return numeric_array{_mm256_hsub_epi16(lhs.reg(), rhs.reg())};
2536 }
2537#endif
2538#if defined(HI_HAS_AVX)
2539 if constexpr (is_f64x4) {
2540 return numeric_array{_mm256_hsub_pd(lhs.reg(), rhs.reg())};
2541 } else if constexpr (is_f32x8) {
2542 return numeric_array{_mm256_hsub_ps(lhs.reg(), rhs.reg())};
2543 }
2544#endif
2545#if defined(HI_HAS_SSSE3)
2546 if constexpr (is_i32x4 or is_u32x4) {
2547 return numeric_array{_mm_hsub_epi32(lhs.reg(), rhs.reg())};
2548 } else if constexpr (is_i16x8 or is_u16x8) {
2549 return numeric_array{_mm_hsub_epi16(lhs.reg(), rhs.reg())};
2550 }
2551#endif
2552#if defined(HI_HAS_SSE3)
2553 if constexpr (is_f64x2) {
2554 return numeric_array{_mm_hsub_pd(lhs.reg(), rhs.reg())};
2555 } else if constexpr (is_f32x4) {
2556 return numeric_array{_mm_hsub_ps(lhs.reg(), rhs.reg())};
2557 }
2558#endif
2559 }
2560
2561 hi_axiom(N % 2 == 0);
2562
2563 auto r = numeric_array{};
2564
2565 std::size_t src_i = 0;
2566 std::size_t dst_i = 0;
2567 while (src_i != N) {
2568 auto tmp = lhs[src_i++];
2569 tmp -= lhs[src_i++];
2570 r.v[dst_i++] = tmp;
2571 }
2572
2573 src_i = 0;
2574 while (src_i != N) {
2575 auto tmp = rhs[src_i++];
2576 tmp -= rhs[src_i++];
2577 r.v[dst_i++] = tmp;
2578 }
2579 return r;
2580 }
2581
2586 template<std::size_t Mask>
2587 [[nodiscard]] friend constexpr numeric_array addsub(numeric_array const& lhs, numeric_array const& rhs) noexcept
2588 {
2589 constexpr std::size_t not_mask = (std::size_t{1} << N) - 1;
2590 return lhs + neg<Mask ^ not_mask>(rhs);
2591 }
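// Illustrative note (example values assumed): in addsub() a set bit in Mask selects addition for
// that element and a clear bit selects subtraction, because the complemented mask is passed to
// neg(). For example, addsub<0b0101>(a, b) == {a[0] + b[0], a[1] - b[1], a[2] + b[2], a[3] - b[3]}.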
2592
2595 [[nodiscard]] friend constexpr numeric_array cross_2D(numeric_array const& rhs) noexcept requires(N >= 2)
2596 {
2597 hi_axiom(rhs.z() == 0.0f && rhs.is_vector());
2598 return numeric_array{-rhs.y(), rhs.x()};
2599 }
2600
2603 [[nodiscard]] friend constexpr numeric_array normal_2D(numeric_array const& rhs) noexcept requires(N >= 2)
2604 {
2605 return normalize<0b0011>(cross_2D(rhs));
2606 }
2607
2611 [[nodiscard]] friend constexpr float cross_2D(numeric_array const& lhs, numeric_array const& rhs) noexcept requires(N >= 2)
2612 {
2613 hilet tmp1 = rhs.yxwz();
2614 hilet tmp2 = lhs * tmp1;
2615 hilet tmp3 = hsub(tmp2, tmp2);
2616 return get<0>(tmp3);
2617 }
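// Illustrative note (example values assumed): the swizzle/hsub sequence above evaluates the 2D
// determinant lhs.x * rhs.y - lhs.y * rhs.x. For vectors a = (2, 3) and b = (4, 5),
// cross_2D(a, b) == 2 * 5 - 3 * 4 == -2.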
2618
2619 // x=a.y*b.z - a.z*b.y
2620 // y=a.z*b.x - a.x*b.z
2621 // z=a.x*b.y - a.y*b.x
2622 // w=a.w*b.w - a.w*b.w
2623 [[nodiscard]] constexpr friend numeric_array cross_3D(numeric_array const& lhs, numeric_array const& rhs) noexcept
2624 requires(N == 4)
2625 {
2626 hilet a_left = lhs.yzxw();
2627 hilet b_left = rhs.zxyw();
2628 hilet left = a_left * b_left;
2629
2630 hilet a_right = lhs.zxyw();
2631 hilet b_right = rhs.yzxw();
2632 hilet right = a_right * b_right;
2633 return left - right;
2634 }
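// Illustrative note (example values assumed): cross_3D() implements the formula in the comment
// above with two swizzled products. For a = (1, 0, 0, 0) and b = (0, 1, 0, 0),
// cross_3D(a, b) == (0, 0, 1, 0); the w component is always a.w * b.w - a.w * b.w == 0.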
2635
2636 [[nodiscard]] static constexpr numeric_array byte_srl_shuffle_indices(unsigned int rhs) requires(is_i8x16)
2637 {
2638 static_assert(std::endian::native == std::endian::little);
2639
2640 auto r = numeric_array{};
2641 for (auto i = 0; i != 16; ++i) {
2642 if ((i + rhs) < 16) {
2643 r[i] = narrow_cast<int8_t>(i + rhs);
2644 } else {
2645 // Indices set to -1 result in a zero after a byte shuffle.
2646 r[i] = -1;
2647 }
2648 }
2649 return r;
2650 }
2651
2652 [[nodiscard]] static constexpr numeric_array byte_sll_shuffle_indices(unsigned int rhs) requires(is_i8x16)
2653 {
2654 static_assert(std::endian::native == std::endian::little);
2655
2656 auto r = numeric_array{};
2657 for (auto i = 0; i != 16; ++i) {
2658 if (i >= narrow_cast<int>(rhs)) {
2659 r[i] = narrow_cast<int8_t>(i - narrow_cast<int>(rhs));
2660 } else {
2661 // Indices set to -1 result in a zero after a byte shuffle.
2662 r[i] = -1;
2663 }
2664 }
2665 return r;
2666 }
2667
2670 [[nodiscard]] friend constexpr numeric_array shuffle(numeric_array const& lhs, numeric_array const& rhs) noexcept
2671 requires(std::is_integral_v<value_type>)
2672 {
2673 if (!std::is_constant_evaluated()) {
2674#if defined(HI_HAS_SSSE3)
2675 if constexpr (is_i8x16 or is_u8x16) {
2676 return numeric_array{_mm_shuffle_epi8(lhs.reg(), rhs.reg())};
2677 }
2678#endif
2679 }
2680
2681 auto r = numeric_array{};
2682 for (std::size_t i = 0; i != N; ++i) {
2683 if (rhs[i] >= 0) {
2684 r[i] = lhs[rhs[i] & 0xf];
2685 } else {
2686 r[i] = 0;
2687 }
2688 }
2689
2690 return r;
2691 }
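// Illustrative note (example values assumed): byte_srl_shuffle_indices() and
// byte_sll_shuffle_indices() produce index arrays for shuffle() that shift a whole 16-byte
// register by rhs bytes, with -1 entries turning into zero bytes. For example,
// byte_srl_shuffle_indices(4) is {4, 5, ..., 15, -1, -1, -1, -1}, so shuffle(x, byte_srl_shuffle_indices(4))
// moves byte 4 of x into byte 0 and zero-fills the top four bytes.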
2692
2695 [[nodiscard]] friend constexpr numeric_array midpoint(numeric_array const& p1, numeric_array const& p2) noexcept
2696 {
2697 hi_axiom(p1.is_point());
2698 hi_axiom(p2.is_point());
2699 return (p1 + p2) * 0.5f;
2700 }
2701
2704 [[nodiscard]] friend constexpr numeric_array reflect_point(numeric_array const& p, numeric_array const anchor) noexcept
2705 {
2706 hi_axiom(p.is_point());
2707 hi_axiom(anchor.is_point());
2708 return anchor - (p - anchor);
2709 }
2710
2711 hi_warning_push();
2712 // C26494 Variable '...' is uninitialized. Always initialize an object (type.5).
2713 // Internal to _MM_TRANSPOSE4_PS
2714 hi_warning_ignore_msvc(26494);
2715 template<typename... Columns>
2716 [[nodiscard]] friend constexpr std::array<numeric_array, N> transpose(Columns const&...columns) noexcept
2717 {
2718 static_assert(sizeof...(Columns) == N, "Can only transpose square matrices");
2719
2720 if (not std::is_constant_evaluated()) {
2721#if defined(HI_HAS_SSE)
2722 if constexpr (is_f32x4 and sizeof...(Columns) == 4) {
2723 auto tmp = std::array<__m128, N>{columns.reg()...};
2724 _MM_TRANSPOSE4_PS(std::get<0>(tmp), std::get<1>(tmp), std::get<2>(tmp), std::get<3>(tmp));
2725 return {
2726 numeric_array{get<0>(tmp)},
2727 numeric_array{get<1>(tmp)},
2728 numeric_array{get<2>(tmp)},
2729 numeric_array{get<3>(tmp)}};
2730 }
2731#endif
2732 }
2733
2734 auto r = std::array<numeric_array, N>{};
2735 auto f = [&r, &columns... ]<std::size_t... Ints>(std::index_sequence<Ints...>)
2736 {
2737 auto tf = [&r](auto i, auto v) {
2738 for (std::size_t j = 0; j != N; ++j) {
2739 r[j][i] = v[j];
2740 }
2741 return 0;
2742 };
2743 static_cast<void>((tf(Ints, columns) + ...));
2744 };
2745 f(std::make_index_sequence<sizeof...(columns)>{});
2746 return r;
2747 }
2748 hi_warning_pop();
2749
2750 [[nodiscard]] constexpr friend numeric_array composit(numeric_array const& under, numeric_array const& over) noexcept
2751 requires(N == 4 && std::is_floating_point_v<T>)
2752 {
2753 if (over.is_transparent()) {
2754 return under;
2755 }
2756 if (over.is_opaque()) {
2757 return over;
2758 }
2759
2760 hilet over_alpha = over.wwww();
2761 hilet under_alpha = under.wwww();
2762
2763 hilet over_color = over.xyz1();
2764 hilet under_color = under.xyz1();
2765
2766 hilet output_color = over_color * over_alpha + under_color * under_alpha * (T{1} - over_alpha);
2767
2768 return output_color / output_color.www1();
2769 }
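// Illustrative note (example values assumed): composit() is the straight-alpha "over" operator;
// the blended colour is (Co * ao + Cu * au * (1 - ao)) / a with composite alpha
// a = ao + au * (1 - ao). For example, compositing 50% transparent black over opaque white
// yields (0.5, 0.5, 0.5, 1.0).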
2770
2771 [[nodiscard]] constexpr friend numeric_array composit(numeric_array const& under, numeric_array const& over) noexcept
2772 requires(is_f16x4)
2773 {
2774 return numeric_array{composit(static_cast<numeric_array<float, 4>>(under), static_cast<numeric_array<float, 4>>(over))};
2775 }
2776
2777 [[nodiscard]] friend std::string to_string(numeric_array const& rhs) noexcept
2778 {
2779 auto r = std::string{};
2780
2781 r += '(';
2782 for (std::size_t i = 0; i != N; ++i) {
2783 if (i != 0) {
2784 r += "; ";
2785 }
2786 r += std::format("{}", rhs[i]);
2787 }
2788 r += ')';
2789 return r;
2790 }
2791
2792 friend std::ostream& operator<<(std::ostream& lhs, numeric_array const& rhs)
2793 {
2794 return lhs << to_string(rhs);
2795 }
2796
2801 template<std::size_t FromElement, std::size_t ToElement>
2802 [[nodiscard]] constexpr friend numeric_array insert(numeric_array const& lhs, numeric_array const& rhs)
2803 {
2804 auto r = numeric_array{};
2805
2806 if (!std::is_constant_evaluated()) {
2807#if defined(HI_HAS_SSE4_1)
2808 if constexpr (is_f32x4) {
2809 constexpr uint8_t insert_mask = static_cast<uint8_t>((FromElement << 6) | (ToElement << 4));
2810 return numeric_array{_mm_insert_ps(lhs.reg(), rhs.reg(), insert_mask)};
2811
2812 } else if constexpr (is_i32x4 or is_u32x4) {
2813 constexpr uint8_t insert_mask = static_cast<uint8_t>((FromElement << 6) | (ToElement << 4));
2814 return numeric_array{
2815 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg()), insert_mask))};
2816 }
2817#endif
2818#if defined(HI_HAS_SSE2)
2819 if constexpr (is_f64x2) {
2820 if constexpr (FromElement == 0 and ToElement == 0) {
2821 return numeric_array{_mm_shuffle_pd(rhs.reg(), lhs.reg(), 0b10)};
2822 } else if constexpr (FromElement == 1 and ToElement == 0) {
2823 return numeric_array{_mm_shuffle_pd(rhs.reg(), lhs.reg(), 0b11)};
2824 } else if constexpr (FromElement == 0 and ToElement == 1) {
2825 return numeric_array{_mm_shuffle_pd(lhs.reg(), rhs.reg(), 0b00)};
2826 } else {
2827 return numeric_array{_mm_shuffle_pd(lhs.reg(), rhs.reg(), 0b10)};
2828 }
2829
2830 } else if constexpr (is_i64x2 or is_u64x2) {
2831 hilet lhs_ = _mm_castsi128_pd(lhs.reg());
2832 hilet rhs_ = _mm_castsi128_pd(rhs.reg());
2833
2834 if constexpr (FromElement == 0 and ToElement == 0) {
2835 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(rhs_, lhs_, 0b10))};
2836 } else if constexpr (FromElement == 1 and ToElement == 0) {
2837 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(rhs_, lhs_, 0b11))};
2838 } else if constexpr (FromElement == 0 and ToElement == 1) {
2839 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(lhs_, rhs_, 0b00))};
2840 } else {
2841 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(lhs_, rhs_, 0b10))};
2842 }
2843 }
2844#endif
2845 }
2846
2847 for (std::size_t i = 0; i != N; ++i) {
2848 r[i] = (i == ToElement) ? rhs[FromElement] : lhs[i];
2849 }
2850
2851 return r;
2852 }
2853
2861 template<ssize_t... Elements>
2862 [[nodiscard]] constexpr numeric_array swizzle() const
2863 {
2864 static_assert(sizeof...(Elements) <= N);
2865
2866 if (!std::is_constant_evaluated()) {
2867#if defined(HI_HAS_AVX)
2868 if constexpr (is_f64x2) {
2869 return numeric_array{_mm_swizzle_pd<Elements...>(reg())};
2870 } else if constexpr (is_f32x4) {
2871 return numeric_array{_mm_swizzle_ps<Elements...>(reg())};
2872 } else if constexpr (is_i64x2 or is_u64x2) {
2873 return numeric_array{_mm_swizzle_epi64<Elements...>(reg())};
2874 } else if constexpr (is_i32x4 or is_u32x4) {
2875 return numeric_array{_mm_swizzle_epi32<Elements...>(reg())};
2876 }
2877#endif
2878 }
2879
2880 auto r = numeric_array{};
2881 swizzle_detail<0, Elements...>(r);
2882 return r;
2883 }
2884
2885#define SWIZZLE(swizzle_name, D, ...) \
2886 [[nodiscard]] constexpr numeric_array swizzle_name() const noexcept requires(D == N) \
2887 { \
2888 return swizzle<__VA_ARGS__>(); \
2889 }
2890
2891#define SWIZZLE_4D_GEN1(name, ...) \
2892 SWIZZLE(name##0, 4, __VA_ARGS__, get_zero) \
2893 SWIZZLE(name##1, 4, __VA_ARGS__, get_one) \
2894 SWIZZLE(name##x, 4, __VA_ARGS__, 0) \
2895 SWIZZLE(name##y, 4, __VA_ARGS__, 1) \
2896 SWIZZLE(name##z, 4, __VA_ARGS__, 2) \
2897 SWIZZLE(name##w, 4, __VA_ARGS__, 3)
2898
2899#define SWIZZLE_4D_GEN2(name, ...) \
2900 SWIZZLE_4D_GEN1(name##0, __VA_ARGS__, get_zero) \
2901 SWIZZLE_4D_GEN1(name##1, __VA_ARGS__, get_one) \
2902 SWIZZLE_4D_GEN1(name##x, __VA_ARGS__, 0) \
2903 SWIZZLE_4D_GEN1(name##y, __VA_ARGS__, 1) \
2904 SWIZZLE_4D_GEN1(name##z, __VA_ARGS__, 2) \
2905 SWIZZLE_4D_GEN1(name##w, __VA_ARGS__, 3)
2906
2907#define SWIZZLE_4D_GEN3(name, ...) \
2908 SWIZZLE_4D_GEN2(name##0, __VA_ARGS__, get_zero) \
2909 SWIZZLE_4D_GEN2(name##1, __VA_ARGS__, get_one) \
2910 SWIZZLE_4D_GEN2(name##x, __VA_ARGS__, 0) \
2911 SWIZZLE_4D_GEN2(name##y, __VA_ARGS__, 1) \
2912 SWIZZLE_4D_GEN2(name##z, __VA_ARGS__, 2) \
2913 SWIZZLE_4D_GEN2(name##w, __VA_ARGS__, 3)
2914
2915 SWIZZLE_4D_GEN3(_0, get_zero)
2916 SWIZZLE_4D_GEN3(_1, get_one)
2917 SWIZZLE_4D_GEN3(x, 0)
2918 SWIZZLE_4D_GEN3(y, 1)
2919 SWIZZLE_4D_GEN3(z, 2)
2920 SWIZZLE_4D_GEN3(w, 3)
2921
2922#define SWIZZLE_3D_GEN1(name, ...) \
2923 SWIZZLE(name##0, 3, __VA_ARGS__, get_zero) \
2924 SWIZZLE(name##1, 3, __VA_ARGS__, get_one) \
2925 SWIZZLE(name##x, 3, __VA_ARGS__, 0) \
2926 SWIZZLE(name##y, 3, __VA_ARGS__, 1) \
2927 SWIZZLE(name##z, 3, __VA_ARGS__, 2)
2928
2929#define SWIZZLE_3D_GEN2(name, ...) \
2930 SWIZZLE_3D_GEN1(name##0, __VA_ARGS__, get_zero) \
2931 SWIZZLE_3D_GEN1(name##1, __VA_ARGS__, get_one) \
2932 SWIZZLE_3D_GEN1(name##x, __VA_ARGS__, 0) \
2933 SWIZZLE_3D_GEN1(name##y, __VA_ARGS__, 1) \
2934 SWIZZLE_3D_GEN1(name##z, __VA_ARGS__, 2)
2935
2936 SWIZZLE_3D_GEN2(_0, get_zero)
2937 SWIZZLE_3D_GEN2(_1, get_one)
2938 SWIZZLE_3D_GEN2(x, 0)
2939 SWIZZLE_3D_GEN2(y, 1)
2940 SWIZZLE_3D_GEN2(z, 2)
2941
2942#define SWIZZLE_2D_GEN1(name, ...) \
2943 SWIZZLE(name##0, 2, __VA_ARGS__, get_zero) \
2944 SWIZZLE(name##1, 2, __VA_ARGS__, get_one) \
2945 SWIZZLE(name##x, 2, __VA_ARGS__, 0) \
2946 SWIZZLE(name##y, 2, __VA_ARGS__, 1)
2947
2948 SWIZZLE_2D_GEN1(_0, get_zero)
2949 SWIZZLE_2D_GEN1(_1, get_one)
2950 SWIZZLE_2D_GEN1(x, 0)
2951 SWIZZLE_2D_GEN1(y, 1)
2952
2953#undef SWIZZLE
2954#undef SWIZZLE_4D_GEN1
2955#undef SWIZZLE_4D_GEN2
2956#undef SWIZZLE_4D_GEN3
2957#undef SWIZZLE_3D_GEN1
2958#undef SWIZZLE_3D_GEN2
2959#undef SWIZZLE_2D_GEN1
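// Illustrative note: the SWIZZLE macros above generate one accessor per element permutation;
// each letter in the name selects a source element ('x', 'y', 'z', 'w' pick elements 0-3) and
// the digits '0' and '1' insert the constants zero and one. For a 4-element array v,
// v.xyz1() == {v.x, v.y, v.z, 1} and v.wwww() broadcasts element 3, as used by composit() above.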
2960
2961 template<ssize_t I, ssize_t FirstElement, ssize_t... RestElements>
2962 constexpr void swizzle_detail(numeric_array& r) const noexcept
2963 {
2964 static_assert(I < narrow_cast<ssize_t>(N));
2965 static_assert(FirstElement >= -2 && FirstElement < narrow_cast<ssize_t>(N), "Index out of bounds");
2966
2967 get<I>(r) = get<FirstElement>(*this);
2968 if constexpr (sizeof...(RestElements) != 0) {
2969 swizzle_detail<I + 1, RestElements...>(r);
2970 }
2971 }
2972};
2973
2974using i8x1 = numeric_array<int8_t, 1>;
2975using i8x2 = numeric_array<int8_t, 2>;
2976using i8x4 = numeric_array<int8_t, 4>;
2977using i8x8 = numeric_array<int8_t, 8>;
2978using i8x16 = numeric_array<int8_t, 16>;
2979using i8x32 = numeric_array<int8_t, 32>;
2980using i8x64 = numeric_array<int8_t, 64>;
2981
2982using u8x1 = numeric_array<uint8_t, 1>;
2983using u8x2 = numeric_array<uint8_t, 2>;
2984using u8x4 = numeric_array<uint8_t, 4>;
2985using u8x8 = numeric_array<uint8_t, 8>;
2986using u8x16 = numeric_array<uint8_t, 16>;
2987using u8x32 = numeric_array<uint8_t, 32>;
2988using u8x64 = numeric_array<uint8_t, 64>;
2989
2990using i16x1 = numeric_array<int16_t, 1>;
2991using i16x2 = numeric_array<int16_t, 2>;
2992using i16x4 = numeric_array<int16_t, 4>;
2993using i16x8 = numeric_array<int16_t, 8>;
2994using i16x16 = numeric_array<int16_t, 16>;
2995using i16x32 = numeric_array<int16_t, 32>;
2996
2997using u16x1 = numeric_array<uint16_t, 1>;
2998using u16x2 = numeric_array<uint16_t, 2>;
2999using u16x4 = numeric_array<uint16_t, 4>;
3000using u16x8 = numeric_array<uint16_t, 8>;
3001using u16x16 = numeric_array<uint16_t, 16>;
3002using u16x32 = numeric_array<uint16_t, 32>;
3003
3004using f16x4 = numeric_array<float16, 4>;
3005
3006using i32x1 = numeric_array<int32_t, 1>;
3007using i32x2 = numeric_array<int32_t, 2>;
3008using i32x4 = numeric_array<int32_t, 4>;
3009using i32x8 = numeric_array<int32_t, 8>;
3010using i32x16 = numeric_array<int32_t, 16>;
3011
3012using u32x1 = numeric_array<uint32_t, 1>;
3013using u32x2 = numeric_array<uint32_t, 2>;
3014using u32x4 = numeric_array<uint32_t, 4>;
3015using u32x8 = numeric_array<uint32_t, 8>;
3016using u32x16 = numeric_array<uint32_t, 16>;
3017
3018using f32x1 = numeric_array<float, 1>;
3019using f32x2 = numeric_array<float, 2>;
3020using f32x4 = numeric_array<float, 4>;
3021using f32x8 = numeric_array<float, 8>;
3022using f32x16 = numeric_array<float, 16>;
3023
3024using i64x1 = numeric_array<int64_t, 1>;
3025using i64x2 = numeric_array<int64_t, 2>;
3026using i64x4 = numeric_array<int64_t, 4>;
3027using i64x8 = numeric_array<int64_t, 8>;
3028
3029using u64x1 = numeric_array<uint64_t, 1>;
3030using u64x2 = numeric_array<uint64_t, 2>;
3031using u64x4 = numeric_array<uint64_t, 4>;
3032using u64x8 = numeric_array<uint64_t, 8>;
3033
3034using f64x1 = numeric_array<double, 1>;
3035using f64x2 = numeric_array<double, 2>;
3036using f64x4 = numeric_array<double, 4>;
3037using f64x8 = numeric_array<double, 8>;
3038
3039} // namespace hi::inline v1
3040
3041template<class T, std::size_t N>
3042struct std::tuple_size<hi::numeric_array<T, N>> : std::integral_constant<std::size_t, N> {
3043};
3044
3045template<std::size_t I, class T, std::size_t N>
3046struct std::tuple_element<I, hi::numeric_array<T, N>> {
3047 using type = T;
3048};
3049
3050hi_warning_pop();