HikoGUI
A low latency retained GUI
numeric_array.hpp
1// Copyright Take Vos 2020-2021.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../architecture.hpp"
8#include "../concepts.hpp"
9#include "../cast.hpp"
10#include "../type_traits.hpp"
11#include "../float16.hpp"
12#include "../math.hpp"
13
14#if defined(HI_HAS_AVX)
15#include "swizzle_avx.hpp"
16#include <immintrin.h> // AVX, AVX2, FMA
17#endif
18#if defined(HI_HAS_SSE4_2)
19#include <nmmintrin.h> // SSE4.2
20#endif
21#if defined(HI_HAS_SSE4_1)
22#include "float16_sse4_1.hpp"
23#include <smmintrin.h> // SSE4.1
24#include <ammintrin.h> // SSE4A
25#endif
26#if defined(HI_HAS_SSSE3)
27#include <tmmintrin.h> // SSSE3
28#endif
29#if defined(HI_HAS_SSE3)
30#include <pmmintrin.h> // SSE3
31#endif
32#if defined(HI_HAS_SSE2)
33#include <emmintrin.h> // SSE2
34#endif
35#if defined(HI_HAS_SSE)
36#include <xmmintrin.h> // SSE
37#endif
38
39#include <cstdint>
40#include <ostream>
41#include <string>
42#include <array>
43#include <type_traits>
44#include <concepts>
45#include <bit>
46#include <climits>
47#include <utility>
48
49hi_warning_push();
50// C4702 unreachable code: Suppressed due to intrinsics and std::is_constant_evaluated()
51hi_warning_ignore_msvc(4702);
52// C26490: Don't use reinterpret_cast (type.1).
53// Needed for casting pointers to or from SSE registers.
54hi_warning_ignore_msvc(26490);
55
56namespace hi::inline v1 {
57
58template<numeric_limited T, std::size_t N>
59struct numeric_array {
60 using container_type = std::array<T, N>;
61 using value_type = typename container_type::value_type;
62 using size_type = typename container_type::size_type;
63 using difference_type = typename container_type::difference_type;
64 using reference = typename container_type::reference;
65 using const_reference = typename container_type::const_reference;
66 using pointer = typename container_type::pointer;
67 using const_pointer = typename container_type::const_pointer;
68 using iterator = typename container_type::iterator;
69 using const_iterator = typename container_type::const_iterator;
70
71 constexpr static bool is_i8x1 = std::is_same_v<T, int8_t> && N == 1;
72 constexpr static bool is_i8x2 = std::is_same_v<T, int8_t> && N == 2;
73 constexpr static bool is_i8x4 = std::is_same_v<T, int8_t> && N == 4;
74 constexpr static bool is_i8x8 = std::is_same_v<T, int8_t> && N == 8;
75 constexpr static bool is_i8x16 = std::is_same_v<T, int8_t> && N == 16;
76 constexpr static bool is_i8x32 = std::is_same_v<T, int8_t> && N == 32;
77 constexpr static bool is_i8x64 = std::is_same_v<T, int8_t> && N == 64;
78 constexpr static bool is_u8x1 = std::is_same_v<T, uint8_t> && N == 1;
79 constexpr static bool is_u8x2 = std::is_same_v<T, uint8_t> && N == 2;
80 constexpr static bool is_u8x4 = std::is_same_v<T, uint8_t> && N == 4;
81 constexpr static bool is_u8x8 = std::is_same_v<T, uint8_t> && N == 8;
82 constexpr static bool is_u8x16 = std::is_same_v<T, uint8_t> && N == 16;
83 constexpr static bool is_u8x32 = std::is_same_v<T, uint8_t> && N == 32;
84 constexpr static bool is_u8x64 = std::is_same_v<T, uint8_t> && N == 64;
85
86 constexpr static bool is_i16x1 = std::is_same_v<T, int16_t> && N == 1;
87 constexpr static bool is_i16x2 = std::is_same_v<T, int16_t> && N == 2;
88 constexpr static bool is_i16x4 = std::is_same_v<T, int16_t> && N == 4;
89 constexpr static bool is_i16x8 = std::is_same_v<T, int16_t> && N == 8;
90 constexpr static bool is_i16x16 = std::is_same_v<T, int16_t> && N == 16;
91 constexpr static bool is_i16x32 = std::is_same_v<T, int16_t> && N == 32;
92 constexpr static bool is_u16x1 = std::is_same_v<T, uint16_t> && N == 1;
93 constexpr static bool is_u16x2 = std::is_same_v<T, uint16_t> && N == 2;
94 constexpr static bool is_u16x4 = std::is_same_v<T, uint16_t> && N == 4;
95 constexpr static bool is_u16x8 = std::is_same_v<T, uint16_t> && N == 8;
96 constexpr static bool is_u16x16 = std::is_same_v<T, uint16_t> && N == 16;
97 constexpr static bool is_u16x32 = std::is_same_v<T, uint16_t> && N == 32;
98 constexpr static bool is_f16x4 = std::is_same_v<T, float16> && N == 4;
99
100 constexpr static bool is_i32x1 = std::is_same_v<T, int32_t> && N == 1;
101 constexpr static bool is_i32x2 = std::is_same_v<T, int32_t> && N == 2;
102 constexpr static bool is_i32x4 = std::is_same_v<T, int32_t> && N == 4;
103 constexpr static bool is_i32x8 = std::is_same_v<T, int32_t> && N == 8;
104 constexpr static bool is_i32x16 = std::is_same_v<T, int32_t> && N == 16;
105 constexpr static bool is_u32x1 = std::is_same_v<T, uint32_t> && N == 1;
106 constexpr static bool is_u32x2 = std::is_same_v<T, uint32_t> && N == 2;
107 constexpr static bool is_u32x4 = std::is_same_v<T, uint32_t> && N == 4;
108 constexpr static bool is_u32x8 = std::is_same_v<T, uint32_t> && N == 8;
109 constexpr static bool is_u32x16 = std::is_same_v<T, uint32_t> && N == 16;
110 constexpr static bool is_f32x1 = std::is_same_v<T, float> && N == 1;
111 constexpr static bool is_f32x2 = std::is_same_v<T, float> && N == 2;
112 constexpr static bool is_f32x4 = std::is_same_v<T, float> && N == 4;
113 constexpr static bool is_f32x8 = std::is_same_v<T, float> && N == 8;
114 constexpr static bool is_f32x16 = std::is_same_v<T, float> && N == 16;
115
116 constexpr static bool is_i64x1 = std::is_same_v<T, int64_t> && N == 1;
117 constexpr static bool is_i64x2 = std::is_same_v<T, int64_t> && N == 2;
118 constexpr static bool is_i64x4 = std::is_same_v<T, int64_t> && N == 4;
119 constexpr static bool is_i64x8 = std::is_same_v<T, int64_t> && N == 8;
120 constexpr static bool is_u64x1 = std::is_same_v<T, uint64_t> && N == 1;
121 constexpr static bool is_u64x2 = std::is_same_v<T, uint64_t> && N == 2;
122 constexpr static bool is_u64x4 = std::is_same_v<T, uint64_t> && N == 4;
123 constexpr static bool is_u64x8 = std::is_same_v<T, uint64_t> && N == 8;
124 constexpr static bool is_f64x1 = std::is_same_v<T, double> && N == 1;
125 constexpr static bool is_f64x2 = std::is_same_v<T, double> && N == 2;
126 constexpr static bool is_f64x4 = std::is_same_v<T, double> && N == 4;
127 constexpr static bool is_f64x8 = std::is_same_v<T, double> && N == 8;
128
130
131 constexpr numeric_array() noexcept
132 {
133 if (not std::is_constant_evaluated()) {
134#if defined(HI_HAS_AVX)
135 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
136 _mm256_storeu_si256(reinterpret_cast<__m256i *>(v.data()), _mm256_setzero_si256());
137 return;
138 } else if constexpr (is_f64x4) {
139 _mm256_storeu_pd(v.data(), _mm256_setzero_pd());
140 return;
141 } else if constexpr (is_f32x8) {
142 _mm256_storeu_ps(v.data(), _mm256_setzero_ps());
143 return;
144 }
145#endif
146#if defined(HI_HAS_SSE2)
147 if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
148 _mm_storeu_si128(reinterpret_cast<__m128i *>(v.data()), _mm_setzero_si128());
149 return;
150 } else if constexpr (is_f64x2) {
151 _mm_storeu_pd(v.data(), _mm_setzero_pd());
152 return;
153 }
154#endif
155#if defined(HI_HAS_SSE)
156 if constexpr (is_f32x4) {
157 _mm_storeu_ps(v.data(), _mm_setzero_ps());
158 return;
159 }
160#endif
161 }
162
163 for (auto i = 0_uz; i != N; ++i) {
164 v[i] = T{};
165 }
166 }
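 // Usage sketch (hypothetical caller code): default construction zero-initializes
 // every element, using a single SIMD store when a matching register type exists.
 //
 //     auto v = numeric_array<float, 4>{};
 //     assert(v[0] == 0.0f and v[3] == 0.0f);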
167
168 constexpr numeric_array(numeric_array const& rhs) noexcept = default;
169 constexpr numeric_array(numeric_array&& rhs) noexcept = default;
170 constexpr numeric_array& operator=(numeric_array const& rhs) noexcept = default;
171 constexpr numeric_array& operator=(numeric_array&& rhs) noexcept = default;
172
173 template<numeric_limited U, std::size_t M>
174 [[nodiscard]] constexpr explicit numeric_array(numeric_array<U, M> const& other) noexcept : v()
175 {
176 if (!std::is_constant_evaluated()) {
177#if defined(HI_HAS_AVX)
178 if constexpr (is_f64x4 and other.is_f32x4) {
179 v = numeric_array{_mm256_cvtps_pd(other.reg())};
180 return;
181 } else if constexpr (is_f64x4 and other.is_i32x4) {
182 v = numeric_array{_mm256_cvtepi32_pd(other.reg())};
183 return;
184 } else if constexpr (is_f32x4 and other.is_f64x4) {
185 v = numeric_array{_mm256_cvtpd_ps(other.reg())};
186 return;
187 } else if constexpr (is_i32x4 and other.is_f64x4) {
188 v = numeric_array{_mm256_cvtpd_epi32(other.reg())};
189 return;
190 } else if constexpr (is_i32x8 and other.is_f32x8) {
191 v = numeric_array{_mm256_cvtps_epi32(other.reg())};
192 return;
193 } else if constexpr (is_f32x8 and other.is_i32x8) {
194 v = numeric_array{_mm256_cvtepi32_ps(other.reg())};
195 return;
196 }
197#endif
198#if defined(HI_HAS_SSE4_1)
199 if constexpr (is_u8x4 and other.is_f32x4) {
200 hilet i32_4 = _mm_cvtps_epi32(other.reg());
201 hilet i16_8 = _mm_packs_epi32(i32_4, _mm_setzero_si128());
202 hilet u8_16 = _mm_packus_epi16(i16_8, _mm_setzero_si128());
203 v = numeric_array{u8_16};
204 return;
205 } else if constexpr (is_i64x2 and other.is_i32x4) {
206 v = numeric_array{_mm_cvtepi32_epi64(other.reg())};
207 return;
208 } else if constexpr (is_i64x2 and other.is_i16x8) {
209 v = numeric_array{_mm_cvtepi16_epi64(other.reg())};
210 return;
211 } else if constexpr (is_i32x4 and other.is_i16x8) {
212 v = numeric_array{_mm_cvtepi16_epi32(other.reg())};
213 return;
214 } else if constexpr (is_i64x2 and other.is_i8x16) {
215 v = numeric_array{_mm_cvtepi8_epi64(other.reg())};
216 return;
217 } else if constexpr (is_i32x4 and other.is_i8x16) {
218 v = numeric_array{_mm_cvtepi8_epi32(other.reg())};
219 return;
220 } else if constexpr (is_i16x8 and other.is_i8x16) {
221 v = numeric_array{_mm_cvtepi8_epi16(other.reg())};
222 return;
223 } else if constexpr (is_f16x4 and other.is_f32x4) {
224 v = numeric_array{_mm_cvtps_ph_sse4_1(other.reg())};
225 return;
226 } else if constexpr (is_f32x4 and other.is_f16x4) {
227 v = numeric_array{_mm_cvtph_ps_sse2(other.reg())};
228 return;
229 }
230
231#endif
232#if defined(HI_HAS_SSE2)
233 if constexpr (is_f64x2 and other.is_i32x4) {
234 v = numeric_array{_mm_cvtepi32_pd(other.reg())};
235 return;
236 } else if constexpr (is_f32x4 and other.is_i32x4) {
237 v = numeric_array{_mm_cvtepi32_ps(other.reg())};
238 return;
239 } else if constexpr (is_i32x4 and other.is_f32x4) {
240 v = numeric_array{_mm_cvtps_epi32(other.reg())};
241 return;
242 }
243#endif
244 }
245
246 for (std::size_t i = 0; i != N; ++i) {
247 if (i < M) {
248 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
249 // SSE conversions round floats before converting to integer.
250 v[i] = static_cast<value_type>(std::round(other[i]));
251 } else {
252 v[i] = static_cast<value_type>(other[i]);
253 }
254 } else {
255 v[i] = T{};
256 }
257 }
258 }
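 // Usage sketch (hypothetical caller code): converting between element types;
 // extra destination elements are zero-filled and float-to-integer conversion rounds.
 //
 //     auto f = numeric_array<float, 4>{0.25f, 1.75f, 3.2f, 4.0f};
 //     auto i = numeric_array<int32_t, 4>{f};   // {0, 2, 3, 4}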
259
260 template<numeric_limited U, std::size_t M>
261 [[nodiscard]] constexpr explicit numeric_array(numeric_array<U, M> const& other1, numeric_array<U, M> const& other2) noexcept
262 :
263 v()
264 {
265 if (!std::is_constant_evaluated()) {
266#if defined(HI_HAS_AVX)
267 if constexpr (is_f64x4 and other1.is_f64x2 and other2.is_f64x2) {
268 v = numeric_array{_mm256_set_m128d(other2.reg(), other1.reg())};
269 } else if constexpr (is_f32x8 and other1.is_f32x4 and other2.is_f32x4) {
270 v = numeric_array{_mm256_set_m128(other2.reg(), other1.reg())};
271 } else if constexpr (
272 std::is_integral_v<T> and std::is_integral_v<U> and (sizeof(T) * N == 32) and (sizeof(U) * M == 16)) {
273 v = numeric_array{_mm256_set_m128i(other2.reg(), other1.reg())};
274 }
275#endif
276#if defined(HI_HAS_SSE4_1)
277 if constexpr (is_u16x8 and other1.is_u32x4 and other2.is_u32x4) {
278 v = numeric_array{_mm_packus_epi32(other2.reg(), other1.reg())};
279 }
280#endif
281#if defined(HI_HAS_SSE2)
282 if constexpr (is_i16x8 and other1.is_i32x4 and other2.is_i32x4) {
283 v = numeric_array{_mm_packs_epi32(other2.reg(), other1.reg())};
284 } else if constexpr (is_i8x16 and other1.is_i16x8 and other2.is_i16x8) {
285 v = numeric_array{_mm_packs_epi16(other2.reg(), other1.reg())};
286 } else if constexpr (is_u8x16 and other1.is_u16x8 and other2.is_u16x8) {
287 v = numeric_array{_mm_packus_epi16(other2.reg(), other1.reg())};
288 }
289#endif
290 }
291
292 for (std::size_t i = 0; i != N; ++i) {
293 if (i < M) {
294 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
295 // SSE conversions round floats before converting to integer.
296 v[i] = static_cast<value_type>(std::round(other1[i]));
297 } else {
298 v[i] = static_cast<value_type>(other1[i]);
299 }
300 } else if (i < M * 2) {
301 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
302 // SSE conversions round floats before converting to integer.
303 v[i] = static_cast<value_type>(std::round(other2[i - M]));
304 } else {
305 v[i] = static_cast<value_type>(other2[i - M]);
306 }
307 } else {
308 v[i] = T{};
309 }
310 }
311 }
312
313 [[nodiscard]] constexpr explicit numeric_array(T const& x) noexcept : v()
314 {
315 if (not std::is_constant_evaluated()) {
316#if defined(HI_HAS_SSE)
317 if constexpr (is_f32x4) {
318 *this = numeric_array{_mm_set_ss(x)};
319 return;
320 }
321#endif
322 }
323 get<0>(v) = x;
324 }
325
326 [[nodiscard]] constexpr explicit numeric_array(T const& x, T const& y) noexcept requires(N >= 2) : v()
327 {
328 if (not std::is_constant_evaluated()) {
329#if defined(HI_HAS_SSE2)
330 if constexpr (is_i32x4) {
331 *this = numeric_array{_mm_set_epi32(0, 0, y, x)};
332 return;
333 }
334#endif
335 }
336 get<0>(v) = x;
337 get<1>(v) = y;
338 }
339
340 [[nodiscard]] constexpr explicit numeric_array(T const& x, T const& y, T const& z) noexcept requires(N >= 3) : v()
341 {
342 if (not std::is_constant_evaluated()) {
343#if defined(HI_HAS_SSE2)
344 if constexpr (is_i32x4) {
345 *this = numeric_array{_mm_set_epi32(0, z, y, x)};
346 return;
347 }
348#endif
349 }
350 get<0>(v) = x;
351 get<1>(v) = y;
352 get<2>(v) = z;
353 }
354
355 [[nodiscard]] constexpr explicit numeric_array(T const& x, T const& y, T const& z, T const& w) noexcept requires(N >= 4) : v()
356 {
357 if (not std::is_constant_evaluated()) {
358#if defined(HI_HAS_SSE2)
359 if constexpr (is_i32x4) {
360 *this = numeric_array{_mm_set_epi32(w, z, y, x)};
361 return;
362 }
363#endif
364 }
365 get<0>(v) = x;
366 get<1>(v) = y;
367 get<2>(v) = z;
368 get<3>(v) = w;
369 }
370
371 [[nodiscard]] static constexpr numeric_array broadcast(T rhs) noexcept
372 {
373 if (not std::is_constant_evaluated()) {
374#if defined(HI_HAS_AVX)
375 if constexpr (is_f64x4) {
376 return numeric_array{_mm256_set1_pd(rhs)};
377 } else if constexpr (is_f32x8) {
378 return numeric_array{_mm256_set1_ps(rhs)};
379 } else if constexpr (is_i64x4) {
380 return numeric_array{_mm256_set1_epi64x(rhs)};
381 } else if constexpr (is_i32x8) {
382 return numeric_array{_mm256_set1_epi32(rhs)};
383 } else if constexpr (is_i16x16) {
384 return numeric_array{_mm256_set1_epi16(rhs)};
385 } else if constexpr (is_i8x32) {
386 return numeric_array{_mm256_set1_epi8(rhs)};
387 }
388#endif
389#if defined(HI_HAS_SSE2)
390 if constexpr (is_f64x2) {
391 return numeric_array{_mm_set1_pd(rhs)};
392 } else if constexpr (is_i64x2) {
393 return numeric_array{_mm_set1_epi64x(rhs)};
394 } else if constexpr (is_i32x4) {
395 return numeric_array{_mm_set1_epi32(rhs)};
396 } else if constexpr (is_i16x8) {
397 return numeric_array{_mm_set1_epi16(rhs)};
398 } else if constexpr (is_i8x16) {
399 return numeric_array{_mm_set1_epi8(rhs)};
400 }
401#endif
402#if defined(HI_HAS_SSE)
403 if constexpr (is_f32x4) {
404 return numeric_array{_mm_set1_ps(rhs)};
405 }
406#endif
407 }
408 auto r = numeric_array{};
409 for (std::size_t i = 0; i != N; ++i) {
410 r[i] = rhs;
411 }
412 return r;
413 }
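 // Usage sketch (hypothetical caller code): replicate one scalar into every element.
 //
 //     auto weights = numeric_array<float, 4>::broadcast(0.25f);   // {0.25, 0.25, 0.25, 0.25}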
414
415 [[nodiscard]] static constexpr numeric_array epsilon() noexcept
416 {
417 if constexpr (std::is_floating_point_v<T>) {
418 return broadcast(std::numeric_limits<T>::min());
419 } else {
420 return broadcast(T{0});
421 }
422 }
423
424 [[nodiscard]] numeric_array(std::array<T, N> const& rhs) noexcept : v(rhs) {}
425
426 numeric_array& operator=(std::array<T, N> const& rhs) noexcept
427 {
428 v = rhs;
429 return *this;
430 }
431
432 [[nodiscard]] operator std::array<T, N>() const noexcept
433 {
434 return v;
435 }
436
437#if defined(HI_HAS_SSE2)
438 [[nodiscard]] __m128i reg() const noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
439 {
440 return _mm_loadu_si128(reinterpret_cast<__m128i const *>(v.data()));
441 }
442
443 [[nodiscard]] __m128i reg() const noexcept requires(is_f16x4)
444 {
445 return _mm_set_epi16(0, 0, 0, 0, get<3>(v).get(), get<2>(v).get(), get<1>(v).get(), get<0>(v).get());
446 }
447#endif
448
449#if defined(HI_HAS_SSE2)
450 [[nodiscard]] __m128 reg() const noexcept requires(is_f32x4)
451 {
452 return _mm_loadu_ps(v.data());
453 }
454#endif
455
456#if defined(HI_HAS_SSE2)
457 [[nodiscard]] __m128d reg() const noexcept requires(is_f64x2)
458 {
459 return _mm_loadu_pd(v.data());
460 }
461#endif
462
463#if defined(HI_HAS_SSE2)
464 [[nodiscard]] explicit numeric_array(__m128i const& rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
465 {
466 _mm_storeu_si128(reinterpret_cast<__m128i *>(v.data()), rhs);
467 }
468#endif
469
470#if defined(HI_HAS_SSE4_1)
471 [[nodiscard]] explicit numeric_array(__m128i const& rhs) noexcept requires(is_f16x4) :
472 v(std::bit_cast<decltype(v)>(_mm_extract_epi64(rhs, 0)))
473 {
474 }
475#endif
476
477#if defined(HI_HAS_SSE4_1)
478 [[nodiscard]] explicit numeric_array(__m128i const& rhs) noexcept requires(is_u8x4) :
479 v(std::bit_cast<decltype(v)>(_mm_extract_epi32(rhs, 0)))
480 {
481 }
482#endif
483
484#if defined(HI_HAS_SSE2)
485 [[nodiscard]] explicit numeric_array(__m128 const& rhs) noexcept requires(is_f32x4)
486 {
487 _mm_storeu_ps(v.data(), rhs);
488 }
489#endif
490
491#if defined(HI_HAS_SSE2)
492 [[nodiscard]] explicit numeric_array(__m128d const& rhs) noexcept requires(is_f64x2)
493 {
494 _mm_storeu_pd(v.data(), rhs);
495 }
496#endif
497
498#if defined(HI_HAS_SSE2)
499 numeric_array& operator=(__m128i const& rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
500 {
501 _mm_storeu_si128(reinterpret_cast<__m128i *>(v.data()), rhs);
502 return *this;
503 }
504#endif
505
506#if defined(HI_HAS_SSE2)
507 numeric_array& operator=(__m128 const& rhs) noexcept requires(is_f32x4)
508 {
509 _mm_storeu_ps(v.data(), rhs);
510 return *this;
511 }
512#endif
513
514#if defined(HI_HAS_SSE2)
515 numeric_array& operator=(__m128d const& rhs) noexcept requires(is_f64x2)
516 {
517 _mm_storeu_pd(v.data(), rhs);
518 return *this;
519 }
520#endif
521
522#if defined(HI_HAS_AVX)
523 [[nodiscard]] __m256i reg() const noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
524 {
525 return _mm256_loadu_si256(reinterpret_cast<__m256i const *>(v.data()));
526 }
527#endif
528
529#if defined(HI_HAS_AVX)
530 [[nodiscard]] __m256 reg() const noexcept requires(is_f32x8)
531 {
532 return _mm256_loadu_ps(v.data());
533 }
534#endif
535
536#if defined(HI_HAS_AVX)
537 [[nodiscard]] __m256d reg() const noexcept requires(is_f64x4)
538 {
539 return _mm256_loadu_pd(v.data());
540 }
541#endif
542
543#if defined(HI_HAS_AVX)
544 [[nodiscard]] explicit numeric_array(__m256i const& rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
545 {
546 _mm256_storeu_si256(reinterpret_cast<__m256i *>(v.data()), rhs);
547 }
548#endif
549
550#if defined(HI_HAS_AVX)
551 [[nodiscard]] explicit numeric_array(__m256 const& rhs) noexcept requires(is_f32x8)
552 {
553 _mm256_storeu_ps(v.data(), rhs);
554 }
555#endif
556
557#if defined(HI_HAS_AVX)
558 [[nodiscard]] explicit numeric_array(__m256d const& rhs) noexcept requires(is_f64x4)
559 {
560 _mm256_storeu_pd(v.data(), rhs);
561 }
562#endif
563
564#if defined(HI_HAS_AVX)
565 numeric_array& operator=(__m256i const& rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
566 {
567 _mm256_storeu_si256(reinterpret_cast<__m256i *>(v.data()), rhs);
568 return *this;
569 }
570#endif
571
572#if defined(HI_HAS_AVX)
573 numeric_array& operator=(__m256 const& rhs) noexcept requires(is_f32x8)
574 {
575 _mm256_storeu_ps(v.data(), rhs);
576 return *this;
577 }
578#endif
579
580#if defined(HI_HAS_AVX)
581 numeric_array& operator=(__m256d const& rhs) noexcept requires(is_f64x4)
582 {
583 _mm256_storeu_pd(v.data(), rhs);
584 return *this;
585 }
586#endif
587
588 template<typename Other>
589 [[nodiscard]] constexpr friend Other bit_cast(numeric_array const& rhs) noexcept
590 requires(sizeof(Other) == sizeof(container_type))
591 {
592 if (not std::is_constant_evaluated()) {
593#if defined(HI_HAS_SSE2)
594 if constexpr (Other::is_f32x4 and std::is_integral_v<T>) {
595 return Other{_mm_castsi128_ps(rhs.reg())};
596 } else if constexpr (Other::is_f32x4 and is_f64x2) {
597 return Other{_mm_castpd_ps(rhs.reg())};
598 } else if constexpr (Other::is_f64x2 and std::is_integral_v<T>) {
599 return Other{_mm_castsi128_pd(rhs.reg())};
600 } else if constexpr (Other::is_f64x2 and is_f32x4) {
601 return Other{_mm_castps_pd(rhs.reg())};
602 } else if constexpr (std::is_integral_v<typename Other::value_type> and is_f32x4) {
603 return Other{_mm_castps_si128(rhs.reg())};
604 } else if constexpr (std::is_integral_v<typename Other::value_type> and is_f64x2) {
605 return Other{_mm_castpd_si128(rhs.reg())};
606 } else if constexpr (std::is_integral_v<typename Other::value_type> and std::is_integral_v<T>) {
607 return Other{rhs.reg()};
608 }
609#endif
610 }
611 return std::bit_cast<Other>(rhs);
612 }
613
616 [[nodiscard]] friend constexpr numeric_array interleave_lo(numeric_array const& a, numeric_array const& b) noexcept
617 {
618 if (not std::is_constant_evaluated()) {
619#if defined(HI_HAS_SSE2)
620 if constexpr (is_f64x2) {
621 return numeric_array{_mm_unpacklo_pd(a.reg(), b.reg())};
622 } else if constexpr (is_i64x2 or is_u64x2) {
623 return numeric_array{_mm_unpacklo_epi64(a.reg(), b.reg())};
624 } else if constexpr (is_i32x4 or is_u32x4) {
625 return numeric_array{_mm_unpacklo_epi32(a.reg(), b.reg())};
626 } else if constexpr (is_i16x8 or is_u16x8) {
627 return numeric_array{_mm_unpacklo_epi16(a.reg(), b.reg())};
628 } else if constexpr (is_i8x16 or is_u8x16) {
629 return numeric_array{_mm_unpacklo_epi8(a.reg(), b.reg())};
630 }
631#endif
632#if defined(HI_HAS_SSE)
633 if constexpr (is_f32x4) {
634 return numeric_array{_mm_unpacklo_ps(a.reg(), b.reg())};
635 }
636#endif
637 }
638
639 auto r = numeric_array{};
640 for (std::size_t i = 0; i != N; ++i) {
641 r[i] = (i % 2 == 0) ? a[i / 2] : b[i / 2];
642 }
643 return r;
644 }
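 // Usage sketch (hypothetical caller code, assuming the interleave_lo declaration above):
 //
 //     auto a = numeric_array<float, 4>{1.0f, 2.0f, 3.0f, 4.0f};
 //     auto b = numeric_array<float, 4>{5.0f, 6.0f, 7.0f, 8.0f};
 //     auto r = interleave_lo(a, b);   // {1, 5, 2, 6}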
645
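 /** Load a numeric_array from the first S bytes of unaligned memory.
 * Bytes beyond S are left zero-initialized.
 */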
650 template<std::size_t S>
651 [[nodiscard]] static constexpr numeric_array load(std::byte const *ptr) noexcept
652 {
653 auto r = numeric_array{};
654 std::memcpy(&r, ptr, S);
655 return r;
656 }
657
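 /** Load a complete numeric_array from unaligned memory.
 */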
662 [[nodiscard]] static constexpr numeric_array load(std::byte const *ptr) noexcept
663 {
664 auto r = numeric_array{};
665 std::memcpy(&r, ptr, sizeof(r));
666 return r;
667 }
668
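 /** Load a complete numeric_array from an unaligned array of elements.
 */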
673 [[nodiscard]] static constexpr numeric_array load(T const *ptr) noexcept
674 {
675 auto r = numeric_array{};
676 std::memcpy(&r, ptr, sizeof(r));
677 return r;
678 }
679
680 template<std::size_t S>
681 constexpr void store(std::byte *ptr) const noexcept
682 {
683 std::memcpy(ptr, this, S);
684 }
685
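 /** Store the complete numeric_array to unaligned memory.
 */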
689 constexpr void store(std::byte *ptr) const noexcept
690 {
691 store<sizeof(*this)>(ptr);
692 }
693
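 /** Check if the vector is non-zero.
 * For floating point elements this is true when any element lies outside the
 * epsilon range around zero; for integer elements when any element is non-zero.
 */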
697 constexpr explicit operator bool() const noexcept
698 {
699 if constexpr (std::is_floating_point_v<T>) {
700 hilet ep = epsilon();
701 // check if any of the elements are outside the epsilon range.
702 return to_bool(gt(-ep, *this) | gt(*this, ep));
703 } else {
704 return to_bool(ne(*this, T{0}));
705 }
706 }
707
708 [[nodiscard]] constexpr T const& operator[](std::size_t i) const noexcept
709 {
710 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
711 hi_axiom(i < N);
712 return v[i];
713 }
714
715 [[nodiscard]] constexpr T& operator[](std::size_t i) noexcept
716 {
717 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
718 hi_axiom(i < N);
719 return v[i];
720 }
721
722 [[nodiscard]] constexpr reference front() noexcept
723 {
724 return v.front();
725 }
726
727 [[nodiscard]] constexpr const_reference front() const noexcept
728 {
729 return v.front();
730 }
731
732 [[nodiscard]] constexpr reference back() noexcept
733 {
734 return v.back();
735 }
736
737 [[nodiscard]] constexpr const_reference back() const noexcept
738 {
739 return v.back();
740 }
741
742 [[nodiscard]] constexpr pointer data() noexcept
743 {
744 return v.data();
745 }
746
747 [[nodiscard]] constexpr const_pointer data() const noexcept
748 {
749 return v.data();
750 }
751
752 [[nodiscard]] constexpr iterator begin() noexcept
753 {
754 return v.begin();
755 }
756
757 [[nodiscard]] constexpr const_iterator begin() const noexcept
758 {
759 return v.begin();
760 }
761
762 [[nodiscard]] constexpr const_iterator cbegin() const noexcept
763 {
764 return v.cbegin();
765 }
766
767 [[nodiscard]] constexpr iterator end() noexcept
768 {
769 return v.end();
770 }
771
772 [[nodiscard]] constexpr const_iterator end() const noexcept
773 {
774 return v.end();
775 }
776
777 [[nodiscard]] constexpr const_iterator cend() const noexcept
778 {
779 return v.cend();
780 }
781
782 [[nodiscard]] constexpr bool empty() const noexcept
783 {
784 return v.empty();
785 }
786
787 [[nodiscard]] constexpr size_type size() const noexcept
788 {
789 return v.size();
790 }
791
792 [[nodiscard]] constexpr size_type max_size() const noexcept
793 {
794 return v.max_size();
795 }
796
797 constexpr bool is_point() const noexcept
798 {
799 return v.back() != T{};
800 }
801
802 constexpr bool is_vector() const noexcept
803 {
804 return v.back() == T{};
805 }
806
807 constexpr bool is_opaque() const noexcept
808 {
809 return a() == T{1};
810 }
811
812 constexpr bool is_transparent() const noexcept
813 {
814 return a() == T{0};
815 }
816
817 [[nodiscard]] constexpr T const& x() const noexcept requires(N >= 1)
818 {
819 return std::get<0>(v);
820 }
821
822 [[nodiscard]] constexpr T const& y() const noexcept requires(N >= 2)
823 {
824 return std::get<1>(v);
825 }
826
827 [[nodiscard]] constexpr T const& z() const noexcept requires(N >= 3)
828 {
829 return std::get<2>(v);
830 }
831
832 [[nodiscard]] constexpr T const& w() const noexcept requires(N >= 4)
833 {
834 return std::get<3>(v);
835 }
836
837 [[nodiscard]] constexpr T& x() noexcept requires(N >= 1)
838 {
839 return std::get<0>(v);
840 }
841
842 [[nodiscard]] constexpr T& y() noexcept requires(N >= 2)
843 {
844 return std::get<1>(v);
845 }
846
847 [[nodiscard]] constexpr T& z() noexcept requires(N >= 3)
848 {
849 return std::get<2>(v);
850 }
851
852 [[nodiscard]] constexpr T& w() noexcept requires(N >= 4)
853 {
854 return std::get<3>(v);
855 }
856
857 [[nodiscard]] constexpr T const& r() const noexcept requires(N >= 1)
858 {
859 return std::get<0>(v);
860 }
861
862 [[nodiscard]] constexpr T const& g() const noexcept requires(N >= 2)
863 {
864 return std::get<1>(v);
865 }
866
867 [[nodiscard]] constexpr T const& b() const noexcept requires(N >= 3)
868 {
869 return std::get<2>(v);
870 }
871
872 [[nodiscard]] constexpr T const& a() const noexcept requires(N >= 4)
873 {
874 return std::get<3>(v);
875 }
876
877 [[nodiscard]] constexpr T& r() noexcept requires(N >= 1)
878 {
879 return std::get<0>(v);
880 }
881
882 [[nodiscard]] constexpr T& g() noexcept requires(N >= 2)
883 {
884 return std::get<1>(v);
885 }
886
887 [[nodiscard]] constexpr T& b() noexcept requires(N >= 3)
888 {
889 return std::get<2>(v);
890 }
891
892 [[nodiscard]] constexpr T& a() noexcept requires(N >= 4)
893 {
894 return std::get<3>(v);
895 }
896
897 [[nodiscard]] constexpr T const& width() const noexcept requires(N >= 1)
898 {
899 return std::get<0>(v);
900 }
901
902 [[nodiscard]] constexpr T const& height() const noexcept requires(N >= 2)
903 {
904 return std::get<1>(v);
905 }
906
907 [[nodiscard]] constexpr T const& depth() const noexcept requires(N >= 3)
908 {
909 return std::get<2>(v);
910 }
911
912 [[nodiscard]] constexpr T& width() noexcept requires(N >= 1)
913 {
914 return std::get<0>(v);
915 }
916
917 [[nodiscard]] constexpr T& height() noexcept requires(N >= 2)
918 {
919 return std::get<1>(v);
920 }
921
922 [[nodiscard]] constexpr T& depth() noexcept requires(N >= 3)
923 {
924 return std::get<2>(v);
925 }
926
927 constexpr numeric_array& operator<<=(unsigned int rhs) noexcept
928 {
929 return *this = *this << rhs;
930 }
931
932 constexpr numeric_array& operator>>=(unsigned int rhs) noexcept
933 {
934 return *this = *this >> rhs;
935 }
936
937 constexpr numeric_array& operator|=(numeric_array const& rhs) noexcept
938 {
939 return *this = *this | rhs;
940 }
941
942 constexpr numeric_array& operator|=(T const& rhs) noexcept
943 {
944 return *this = *this | rhs;
945 }
946
947 constexpr numeric_array& operator&=(numeric_array const& rhs) noexcept
948 {
949 return *this = *this & rhs;
950 }
951
952 constexpr numeric_array& operator&=(T const& rhs) noexcept
953 {
954 return *this = *this & rhs;
955 }
956
957 constexpr numeric_array& operator^=(numeric_array const& rhs) noexcept
958 {
959 return *this = *this ^ rhs;
960 }
961
962 constexpr numeric_array& operator^=(T const& rhs) noexcept
963 {
964 return *this = *this ^ rhs;
965 }
966
967 constexpr numeric_array& operator+=(numeric_array const& rhs) noexcept
968 {
969 return *this = *this + rhs;
970 }
971
972 constexpr numeric_array& operator+=(T const& rhs) noexcept
973 {
974 return *this = *this + rhs;
975 }
976
977 constexpr numeric_array& operator-=(numeric_array const& rhs) noexcept
978 {
979 return *this = *this - rhs;
980 }
981
982 constexpr numeric_array& operator-=(T const& rhs) noexcept
983 {
984 return *this = *this - rhs;
985 }
986
987 constexpr numeric_array& operator*=(numeric_array const& rhs) noexcept
988 {
989 return *this = *this * rhs;
990 }
991
992 constexpr numeric_array& operator*=(T const& rhs) noexcept
993 {
994 return *this = *this * rhs;
995 }
996
997 constexpr numeric_array& operator/=(numeric_array const& rhs) noexcept
998 {
999 return *this = *this / rhs;
1000 }
1001
1002 constexpr numeric_array& operator/=(T const& rhs) noexcept
1003 {
1004 return *this = *this / rhs;
1005 }
1006
1007 constexpr numeric_array& operator%=(numeric_array const& rhs) noexcept
1008 {
1009 return *this = *this % rhs;
1010 }
1011
1012 constexpr numeric_array& operator%=(T const& rhs) noexcept
1013 {
1014 return *this = *this % rhs;
1015 }
1016
1017 constexpr static ssize_t get_zero = -1;
1018 constexpr static ssize_t get_one = -2;
1019
1024 template<std::size_t I>
1025 [[nodiscard]] friend constexpr T& get(numeric_array& rhs) noexcept
1026 {
1027 static_assert(I < N, "Index out of bounds");
1028 return std::get<I>(rhs.v);
1029 }
1030
1036 template<ssize_t I>
1037 [[nodiscard]] friend constexpr T get(numeric_array&& rhs) noexcept
1038 {
1039 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
1040 static_assert(I >= -2 && I < narrow_cast<ssize_t>(N), "Index out of bounds");
1041 if constexpr (I == get_zero) {
1042 return T{0};
1043 } else if constexpr (I == get_one) {
1044 return T{1};
1045 } else {
1046 return std::get<I>(rhs.v);
1047 }
1048 }
1049
1056 template<std::size_t I>
1057 [[nodiscard]] constexpr friend T extract(numeric_array const& rhs) noexcept
1058 {
1059 static_assert(I < N);
1060
1061 if (not std::is_constant_evaluated()) {
1062#if defined(HI_HAS_AVX2)
1063 if constexpr (is_i16x16 or is_u16x16) {
1064 return static_cast<T>(_mm256_extract_epi16(rhs.reg(), I));
1065 } else if constexpr (is_i8x32 or is_u8x32) {
1066 return static_cast<T>(_mm256_extract_epi8(rhs.reg(), I));
1067 }
1068#endif
1069#if defined(HI_HAS_AVX)
1070 if constexpr (is_f64x4) {
1071 return std::bit_cast<T>(_mm256_extract_epi64(_mm256_castpd_si256(rhs.reg()), I));
1072 } else if constexpr (is_f32x8) {
1073 return std::bit_cast<T>(_mm256_extract_epi32(_mm256_castps_si256(rhs.reg()), I));
1074 } else if constexpr (is_i64x4 or is_u64x4) {
1075 return static_cast<T>(_mm256_extract_epi64(rhs.reg(), I));
1076 } else if constexpr (is_i32x8 or is_u32x8) {
1077 return static_cast<T>(_mm256_extract_epi32(rhs.reg(), I));
1078 }
1079#endif
1080#if defined(HI_HAS_SSE4_1)
1081 if constexpr (is_f64x2) {
1082 return std::bit_cast<T>(_mm_extract_epi64(_mm_castpd_si128(rhs.reg()), I));
1083 } else if constexpr (is_f32x4) {
1084 return std::bit_cast<T>(_mm_extract_ps(rhs.reg(), I));
1085 } else if constexpr (is_i64x2 or is_u64x2) {
1086 return static_cast<T>(_mm_extract_epi64(rhs.reg(), I));
1087 } else if constexpr (is_i32x4 or is_u32x4) {
1088 return static_cast<T>(_mm_extract_epi32(rhs.reg(), I));
1089 } else if constexpr (is_i8x16 or is_u8x16) {
1090 return static_cast<T>(_mm_extract_epi8(rhs.reg(), I));
1091 }
1092#endif
1093#if defined(HI_HAS_SSE2)
1094 if constexpr (is_i16x8 or is_u16x8) {
1095 return static_cast<T>(_mm_extract_epi16(rhs.reg(), I));
1096 }
1097#endif
1098 }
1099
1100 return get<I>(rhs);
1101 }
1102
1111 template<std::size_t I, std::size_t ZeroMask = 0>
1112 [[nodiscard]] constexpr friend numeric_array insert(numeric_array const& lhs, T rhs) noexcept
1113 requires(is_f32x4 or is_i32x4 or is_u32x4)
1114 {
1115 static_assert(I < N);
1116 static_assert(ZeroMask <= ((1 << N) - 1));
1117
1118 if (not std::is_constant_evaluated()) {
1119#if defined(HI_HAS_SSE4_1)
1120 if constexpr (is_f32x4) {
1121 constexpr int imm8 = (I << 4) | ZeroMask;
1122 return numeric_array{_mm_insert_ps(lhs.reg(), _mm_set_ss(rhs), imm8)};
1123 } else if constexpr (is_i32x4 or is_u32x4) {
1124 constexpr int imm8 = (I << 4) | ZeroMask;
1125 return numeric_array{
1126 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(_mm_set1_epi32(rhs)), imm8))};
1127 }
1128#endif
1129 }
1130
1131 auto r = lhs;
1132 std::get<I>(r.v) = rhs;
1133 for (std::size_t i = 0; i != N; ++i) {
1134 if ((ZeroMask >> i) & 1) {
1135 r.v[i] = T{};
1136 }
1137 }
1138 return r;
1139 }
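 // Usage sketch (hypothetical caller code): replace one element and optionally
 // zero others in the same operation.
 //
 //     auto p = numeric_array<float, 4>{1.0f, 2.0f, 3.0f, 4.0f};
 //     auto q = insert<3>(p, 1.0f);          // {1, 2, 3, 1}
 //     auto r = insert<0, 0b1100>(p, 9.0f);  // {9, 2, 0, 0}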
1140
1146 template<ssize_t I>
1147 [[nodiscard]] friend constexpr T get(numeric_array const& rhs) noexcept
1148 {
1149 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
1150 static_assert(I >= -2 && I < narrow_cast<ssize_t>(N), "Index out of bounds");
1151 if constexpr (I == get_zero) {
1152 return T{0};
1153 } else if constexpr (I == get_one) {
1154 return T{1};
1155 } else {
1156 return std::get<I>(rhs.v);
1157 }
1158 }
1159
1164 template<std::size_t Mask = ~std::size_t{0}>
1165 [[nodiscard]] friend constexpr numeric_array zero(numeric_array rhs) noexcept
1166 {
1167 if (not std::is_constant_evaluated()) {
1168#if defined(HI_HAS_SSE4_1)
1169 if constexpr (is_f32x4) {
1170 return numeric_array{_mm_insert_ps(rhs.reg(), rhs.reg(), Mask)};
1171 } else if constexpr (is_i32x4 or is_u32x4) {
1172 return numeric_array{
1173 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(rhs.reg()), _mm_castsi128_ps(rhs.reg()), Mask))};
1174 }
1175#endif
1176 }
1177
1178 auto r = numeric_array{};
1179 for (std::size_t i = 0; i != N; ++i) {
1180 if (to_bool((Mask >> i) & 1)) {
1181 r.v[i] = T{0};
1182 } else {
1183 r.v[i] = rhs.v[i];
1184 }
1185 }
1186 return r;
1187 }
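 // Usage sketch (hypothetical caller code): bit i of Mask selects which elements to zero.
 //
 //     auto p = numeric_array<float, 4>{1.0f, 2.0f, 3.0f, 4.0f};
 //     auto r = zero<0b1010>(p);   // {1, 0, 3, 0}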
1188
1196 template<std::size_t Mask>
1197 [[nodiscard]] friend constexpr numeric_array blend(numeric_array const& lhs, numeric_array const& rhs) noexcept
1198 {
1199 if (not std::is_constant_evaluated()) {
1200#if defined(HI_HAS_AVX2)
1201 if constexpr (is_i32x8) {
1202 return numeric_array{_mm256_blend_epi32(lhs.reg(), rhs.reg(), Mask)};
1203 } else if constexpr (is_i64x2 or is_u64x2) {
1204 constexpr auto mask_x2 = ((Mask & 1) ? 0b0011 : 0) | ((Mask & 2) ? 0b1100 : 0);
1205 return numeric_array{_mm_blend_epi32(lhs.reg(), rhs.reg(), mask_x2)};
1206 } else if constexpr (is_i32x4 or is_u32x4) {
1207 return numeric_array{_mm_blend_epi32(lhs.reg(), rhs.reg(), Mask)};
1208 } else if constexpr (is_i16x16 or is_u16x16) {
1209 return numeric_array{_mm256_blend_epi16(lhs.reg(), rhs.reg(), Mask)};
1210 }
1211#endif
1212#if defined(HI_HAS_AVX)
1213 if constexpr (is_f64x4) {
1214 return numeric_array{_mm256_blend_pd(lhs.reg(), rhs.reg(), Mask)};
1215 } else if constexpr (is_f32x8) {
1216 return numeric_array{_mm256_blend_ps(lhs.reg(), rhs.reg(), Mask)};
1217 } else if constexpr (is_i64x4 or is_u64x4) {
1218 return numeric_array{
1219 _mm256_castpd_si256(_mm256_blend_pd(_mm256_castsi256_pd(lhs.reg()), _mm256_castsi256_pd(rhs.reg()), Mask))};
1220 } else if constexpr (is_i32x8 or is_u32x8) {
1221 return numeric_array{
1222 _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg()), Mask))};
1223 }
1224#endif
1225#if defined(HI_HAS_SSE4_1)
1226 if constexpr (is_f64x2) {
1227 return numeric_array{_mm_blend_pd(lhs.reg(), rhs.reg(), Mask)};
1228 } else if constexpr (is_f32x4) {
1229 return numeric_array{_mm_blend_ps(lhs.reg(), rhs.reg(), Mask)};
1230 } else if constexpr (is_i64x2 or is_u64x2) {
1231 return numeric_array{
1232 _mm_castpd_si128(_mm_blend_pd(_mm_castsi128_pd(lhs.reg()), _mm_castsi128_pd(rhs.reg()), Mask))};
1233 } else if constexpr (is_i32x4 or is_u32x4) {
1234 return numeric_array{
1235 _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg()), Mask))};
1236 } else if constexpr (is_i16x8 or is_u16x8) {
1237 return numeric_array{_mm_blend_epi16(lhs.reg(), rhs.reg(), Mask)};
1238 }
1239#endif
1240 }
1241
1242 auto r = numeric_array{};
1243 for (std::size_t i = 0; i != N; ++i) {
1244 r[i] = to_bool((Mask >> i) & 1) ? rhs[i] : lhs[i];
1245 }
1246 return r;
1247 }
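 // Usage sketch (hypothetical caller code): bit i of Mask selects rhs[i], a clear bit keeps lhs[i].
 //
 //     auto a = numeric_array<float, 4>{1.0f, 2.0f, 3.0f, 4.0f};
 //     auto b = numeric_array<float, 4>{5.0f, 6.0f, 7.0f, 8.0f};
 //     auto r = blend<0b0101>(a, b);   // {5, 2, 7, 4}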
1248
1251 [[nodiscard]] friend constexpr numeric_array blend(numeric_array const& a, numeric_array const& b, numeric_array const& mask)
1252 {
1253 if (not std::is_constant_evaluated()) {
1254#if defined(HI_HAS_AVX2)
1255 if constexpr (is_i8x32 or is_u8x32) {
1256 return numeric_array{_mm256_blendv_epi8(a.reg(), b.reg(), mask.reg())};
1257 }
1258#endif
1259#if defined(HI_HAS_AVX)
1260 if constexpr (is_f64x4) {
1261 return numeric_array{_mm256_blendv_pd(a.reg(), b.reg(), mask.reg())};
1262 } else if constexpr (is_f32x8) {
1263 return numeric_array{_mm256_blendv_ps(a.reg(), b.reg(), mask.reg())};
1264 } else if constexpr (is_i64x4 or is_u64x4) {
1265 return numeric_array{_mm256_castpd_si256(_mm256_blendv_pd(
1266 _mm256_castsi256_pd(a.reg()), _mm256_castsi256_pd(b.reg()), _mm256_castsi256_pd(mask.reg())))};
1267 } else if constexpr (is_i32x8 or is_u32x8) {
1268 return numeric_array{_mm256_castps_si256(_mm256_blendv_ps(
1269 _mm256_castsi256_ps(a.reg()), _mm256_castsi256_ps(b.reg()), _mm256_castsi256_ps(mask.reg())))};
1270 }
1271#endif
1272#if defined(HI_HAS_SSE4_1)
1273 if constexpr (is_f64x2) {
1274 return numeric_array{_mm_blendv_pd(a.reg(), b.reg(), mask.reg())};
1275 } else if constexpr (is_f32x4) {
1276 return numeric_array{_mm_blendv_ps(a.reg(), b.reg(), mask.reg())};
1277 } else if constexpr (is_i64x2 or is_u64x2) {
1278 return numeric_array{_mm_castpd_si128(
1279 _mm_blendv_pd(_mm_castsi128_pd(a.reg()), _mm_castsi128_pd(b.reg()), _mm_castsi128_pd(mask.reg())))};
1280 } else if constexpr (is_i32x4 or is_u32x4) {
1281 return numeric_array{_mm_castps_si128(
1282 _mm_blendv_ps(_mm_castsi128_ps(a.reg()), _mm_castsi128_ps(b.reg()), _mm_castsi128_ps(mask.reg())))};
1283 } else if constexpr (is_i8x16 or is_u8x16) {
1284 return numeric_array{_mm_blendv_epi8(a.reg(), b.reg(), mask.reg())};
1285 }
1286#endif
1287 }
1288
1289 auto r = numeric_array{};
1290 for (std::size_t i = 0; i != N; ++i) {
1291 r[i] = mask[i] != T{0} ? b[i] : a[i];
1292 }
1293 return r;
1294 }
1295
1300 template<std::size_t Mask>
1301 [[nodiscard]] friend constexpr numeric_array neg(numeric_array rhs) noexcept
1302 {
1303 return blend<Mask>(rhs, -rhs);
1304 }
1305
1306 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const& rhs) noexcept
1307 {
1308 return T{0} - rhs;
1309 }
1310
1311 [[nodiscard]] friend constexpr numeric_array abs(numeric_array const& rhs) noexcept
1312 {
1313 if (not std::is_constant_evaluated()) {
1314#if defined(HI_HAS_AVX2)
1315 if constexpr (is_i32x8) {
1316 return numeric_array{_mm256_abs_epi32(rhs.reg())};
1317 } else if constexpr (is_i16x16) {
1318 return numeric_array{_mm256_abs_epi16(rhs.reg())};
1319 } else if constexpr (is_i8x32) {
1320 return numeric_array{_mm256_abs_epi8(rhs.reg())};
1321 }
1322#endif
1323#if defined(HI_HAS_SSSE3)
1324 if constexpr (is_i32x4) {
1325 return numeric_array{_mm_abs_epi32(rhs.reg())};
1326 } else if constexpr (is_i16x8) {
1327 return numeric_array{_mm_abs_epi16(rhs.reg())};
1328 } else if constexpr (is_i8x16) {
1329 return numeric_array{_mm_abs_epi8(rhs.reg())};
1330 }
1331#endif
1332#if defined(HI_HAS_SSE2)
1333 if constexpr (is_f64x2) {
1334 return numeric_array{_mm_castsi128_pd(_mm_srli_epi64(_mm_slli_epi64(_mm_castpd_si128(rhs.reg()), 1), 1))};
1335 } else if constexpr (is_f32x4) {
1336 return numeric_array{_mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(_mm_castps_si128(rhs.reg()), 1), 1))};
1337 }
1338#endif
1339 }
1340
1341 return max(rhs, -rhs);
1342 }
1343
1344 [[nodiscard]] friend constexpr numeric_array rcp(numeric_array const& rhs) noexcept
1345 {
1346 if (not std::is_constant_evaluated()) {
1347#if defined(HI_HAS_AVX)
1348 if constexpr (is_f32x8) {
1349 return numeric_array{_mm256_rcp_ps(rhs.reg())};
1350 }
1351#endif
1352#if defined(HI_HAS_SSE)
1353 if constexpr (is_f32x4) {
1354 return numeric_array{_mm_rcp_ps(rhs.reg())};
1355 }
1356#endif
1357 }
1358
1359 return T{1} / rhs;
1360 }
1361
1362 [[nodiscard]] friend constexpr numeric_array sqrt(numeric_array const& rhs) noexcept
1363 {
1364 if (not std::is_constant_evaluated()) {
1365#if defined(HI_HAS_AVX)
1366 if constexpr (is_f64x4) {
1367 return numeric_array{_mm256_sqrt_pd(rhs.reg())};
1368 } else if constexpr (is_f32x8) {
1369 return numeric_array{_mm256_sqrt_ps(rhs.reg())};
1370 }
1371#endif
1372#if defined(HI_HAS_SSE2)
1373 if constexpr (is_f64x2) {
1374 return numeric_array{_mm_sqrt_pd(rhs.reg())};
1375 }
1376#endif
1377#if defined(HI_HAS_SSE)
1378 if constexpr (is_f32x4) {
1379 return numeric_array{_mm_sqrt_ps(rhs.reg())};
1380 }
1381#endif
1382 }
1383
1384 auto r = numeric_array{};
1385 for (std::size_t i = 0; i != N; ++i) {
1386 r[i] = std::sqrt(rhs.v[i]);
1387 }
1388 return r;
1389 }
1390
1391 [[nodiscard]] friend constexpr numeric_array rcp_sqrt(numeric_array const& rhs) noexcept
1392 {
1393 if (not std::is_constant_evaluated()) {
1394#if defined(HI_HAS_AVX)
1395 if constexpr (is_f32x8) {
1396 return numeric_array{_mm256_rsqrt_ps(rhs.reg())};
1397 }
1398#endif
1399#if defined(HI_HAS_SSE)
1400 if constexpr (is_f32x4) {
1401 return numeric_array{_mm_rsqrt_ps(rhs.reg())};
1402 }
1403#endif
1404 }
1405
1406 return rcp(sqrt(rhs));
1407 }
1408
1409 [[nodiscard]] friend constexpr numeric_array floor(numeric_array const& rhs) noexcept
1410 requires(std::is_floating_point_v<value_type>)
1411 {
1412 if (not std::is_constant_evaluated()) {
1413#if defined(HI_HAS_AVX)
1414 if constexpr (is_f64x4) {
1415 return numeric_array{_mm256_floor_pd(rhs.reg())};
1416 } else if constexpr (is_f32x8) {
1417 return numeric_array{_mm256_floor_ps(rhs.reg())};
1418 }
1419#endif
1420#if defined(HI_HAS_SSE4_1)
1421 if constexpr (is_f64x2) {
1422 return numeric_array{_mm_floor_pd(rhs.reg())};
1423 } else if constexpr (is_f32x4) {
1424 return numeric_array{_mm_floor_ps(rhs.reg())};
1425 }
1426#endif
1427 }
1428
1429 auto r = numeric_array{};
1430 for (std::size_t i = 0; i != N; ++i) {
1431 r[i] = std::floor(rhs.v[i]);
1432 }
1433 return r;
1434 }
1435
1436 [[nodiscard]] friend constexpr numeric_array ceil(numeric_array const& rhs) noexcept
1437 requires(std::is_floating_point_v<value_type>)
1438 {
1439 if (not std::is_constant_evaluated()) {
1440#if defined(HI_HAS_AVX)
1441 if constexpr (is_f64x4) {
1442 return numeric_array{_mm256_ceil_pd(rhs.reg())};
1443 } else if constexpr (is_f32x8) {
1444 return numeric_array{_mm256_ceil_ps(rhs.reg())};
1445 }
1446#endif
1447#if defined(HI_HAS_SSE4_1)
1448 if constexpr (is_f64x2) {
1449 return numeric_array{_mm_ceil_pd(rhs.reg())};
1450 } else if constexpr (is_f32x4) {
1451 return numeric_array{_mm_ceil_ps(rhs.reg())};
1452 }
1453#endif
1454 }
1455
1456 auto r = numeric_array{};
1457 for (std::size_t i = 0; i != N; ++i) {
1458 r[i] = std::ceil(rhs.v[i]);
1459 }
1460 return r;
1461 }
1462
1463 [[nodiscard]] friend constexpr numeric_array round(numeric_array const& rhs) noexcept
1464 requires(std::is_floating_point_v<value_type>)
1465 {
1466 if (not std::is_constant_evaluated()) {
1467#if defined(HI_HAS_AVX)
1468 if constexpr (is_f64x4) {
1469 return numeric_array{_mm256_round_pd(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1470 } else if constexpr (is_f32x8) {
1471 return numeric_array{_mm256_round_ps(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1472 }
1473#endif
1474#if defined(HI_HAS_SSE4_1)
1475 if constexpr (is_f64x2) {
1476 return numeric_array{_mm_round_pd(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1477 } else if constexpr (is_f32x4) {
1478 return numeric_array{_mm_round_ps(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1479 }
1480#endif
1481 }
1482
1483 auto r = numeric_array{};
1484 for (std::size_t i = 0; i != N; ++i) {
1485 r[i] = std::round(rhs.v[i]);
1486 }
1487 return r;
1488 }
1489
1497 template<std::size_t Mask>
1498 [[nodiscard]] hi_force_inline friend constexpr T dot(numeric_array const& lhs, numeric_array const& rhs) noexcept
1499 {
1500 if (not std::is_constant_evaluated()) {
1501#if defined(HI_HAS_SSE4_1)
1502 if constexpr (is_f64x2) {
1503 return std::bit_cast<double>(_mm_extract_epi64(_mm_castpd_si128(_mm_dp_pd(lhs.reg(), rhs.reg(), (Mask << 4) | 0xf)), 0));
1504 } else if constexpr (is_f32x4) {
1505 return std::bit_cast<float>(_mm_extract_ps(_mm_dp_ps(lhs.reg(), rhs.reg(), (Mask << 4) | 0xf), 0));
1506 }
1507#endif
1508 }
1509
1510 auto r = T{};
1511 for (std::size_t i = 0; i != N; ++i) {
1512 if (to_bool(Mask & (1_uz << i))) {
1513 r += lhs.v[i] * rhs.v[i];
1514 }
1515 }
1516 return r;
1517 }
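 // Usage sketch (hypothetical caller code): Mask selects which element products are summed.
 //
 //     auto a = numeric_array<float, 4>{1.0f, 2.0f, 3.0f, 0.0f};
 //     auto b = numeric_array<float, 4>{4.0f, 5.0f, 6.0f, 0.0f};
 //     auto d = dot<0b0111>(a, b);   // 1*4 + 2*5 + 3*6 = 32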
1518
1526 template<std::size_t Mask>
1527 [[nodiscard]] friend constexpr T hypot(numeric_array const& rhs) noexcept
1528 {
1529 return std::sqrt(dot<Mask>(rhs, rhs));
1530 }
1531
1539 template<std::size_t Mask>
1540 [[nodiscard]] hi_force_inline friend constexpr T squared_hypot(numeric_array const& rhs) noexcept
1541 {
1542 return dot<Mask>(rhs, rhs);
1543 }
1544
1551 template<std::size_t Mask>
1552 [[nodiscard]] friend constexpr T rcp_hypot(numeric_array const& rhs) noexcept
1553 {
1554 if (not std::is_constant_evaluated()) {
1555#if defined(HI_HAS_SSE4_1)
1556 if constexpr (is_f32x4) {
1557 return std::bit_cast<float>(_mm_extract_ps(_mm_rsqrt_ps(_mm_dp_ps(rhs.reg(), rhs.reg(), (Mask << 4) | 0xf)), 0));
1558 }
1559#endif
1560 }
1561
1562 return 1.0f / hypot<Mask>(rhs);
1563 }
1564
1573 template<std::size_t Mask>
1574 [[nodiscard]] friend constexpr numeric_array normalize(numeric_array const& rhs) noexcept
1575 {
1576 hi_axiom(rhs.is_vector());
1577
1578 if (not std::is_constant_evaluated()) {
1579#if defined(HI_HAS_SSE4_1)
1580 if constexpr (is_f32x4) {
1581 hilet rhs_ = rhs.reg();
1582 hilet tmp = _mm_mul_ps(_mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, (Mask << 4) | 0xf)), rhs_);
1583 return numeric_array{_mm_insert_ps(tmp, tmp, ~Mask & 0xf)};
1584 }
1585#endif
1586 }
1587
1588 hilet rcp_hypot_ = rcp_hypot<Mask>(rhs);
1589
1590 auto r = numeric_array{};
1591 for (std::size_t i = 0; i != N; ++i) {
1592 if (to_bool(Mask & (1_uz << i))) {
1593 r.v[i] = rhs.v[i] * rcp_hypot_;
1594 }
1595 }
1596 return r;
1597 }
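 // Usage sketch (hypothetical caller code): normalize the masked elements of a vector
 // (the last element must be zero, see is_vector()); unmasked elements become zero.
 //
 //     auto v = numeric_array<float, 4>{3.0f, 4.0f, 0.0f, 0.0f};
 //     auto n = normalize<0b0011>(v);   // approximately {0.6, 0.8, 0, 0}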
1598
1599 [[nodiscard]] friend constexpr std::size_t eq(numeric_array const& lhs, numeric_array const& rhs) noexcept
1600 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1601 {
1602 if (not std::is_constant_evaluated()) {
1603#if defined(HI_HAS_AVX2)
1604 if constexpr (is_i64x4 or is_u64x4) {
1605 return static_cast<std::size_t>(
1606 _mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(lhs.reg(), rhs.reg()))));
1607 } else if constexpr (is_i32x8 or is_u32x8) {
1608 return static_cast<std::size_t>(
1609 _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(lhs.reg(), rhs.reg()))));
1610 } else if constexpr (is_i8x32 or is_u8x32) {
1611 return static_cast<std::size_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(lhs.reg(), rhs.reg())));
1612 }
1613#endif
1614#if defined(HI_HAS_AVX)
1615 if constexpr (is_f64x4) {
1616 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_EQ_OQ)));
1617 } else if constexpr (is_f32x8) {
1618 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_EQ_OQ)));
1619 }
1620#endif
1621#if defined(HI_HAS_SSE4_1)
1622 if constexpr (is_i64x2 or is_u64x2) {
1623 return static_cast<std::size_t>(_mm_movemask_pd(_mm_castsi128_pd(_mm_cmpeq_epi64(lhs.reg(), rhs.reg()))));
1624 }
1625#endif
1626#if defined(HI_HAS_SSE2)
1627 if constexpr (is_f64x2) {
1628 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpeq_pd(lhs.reg(), rhs.reg())));
1629 } else if constexpr (is_i32x4 or is_u32x4) {
1630 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs.reg(), rhs.reg()))));
1631 } else if constexpr (is_i8x16 or is_u8x16) {
1632 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(lhs.reg(), rhs.reg())));
1633 }
1634#endif
1635#if defined(HI_HAS_SSE)
1636 if constexpr (is_f32x4) {
1637 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpeq_ps(lhs.reg(), rhs.reg())));
1638 }
1639#endif
1640 }
1641
1642 std::size_t r = 0;
1643 for (std::size_t i = 0; i != N; ++i) {
1644 r |= static_cast<std::size_t>(lhs.v[i] == rhs.v[i]) << i;
1645 }
1646 return r;
1647 }
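 // Usage sketch (hypothetical caller code): comparisons return a bit mask, with bit i
 // corresponding to element i.
 //
 //     auto a = numeric_array<int32_t, 4>{1, 2, 3, 4};
 //     auto b = numeric_array<int32_t, 4>{1, 0, 3, 0};
 //     auto m = eq(a, b);   // 0b0101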
1648
1649 [[nodiscard]] friend constexpr std::size_t ne(numeric_array const& lhs, numeric_array const& rhs) noexcept
1650 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1651 {
1652 if (not std::is_constant_evaluated()) {
1653#if defined(HI_HAS_AVX)
1654 if constexpr (is_f64x4) {
1655 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_NEQ_OQ)));
1656 } else if constexpr (is_f32x8) {
1657 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_NEQ_OQ)));
1658 }
1659#endif
1660#if defined(HI_HAS_SSE2)
1661 if constexpr (is_f64x2) {
1662 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpneq_pd(lhs.reg(), rhs.reg())));
1663 }
1664#endif
1665#if defined(HI_HAS_SSE)
1666 if constexpr (is_f32x4) {
1667 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpneq_ps(lhs.reg(), rhs.reg())));
1668 }
1669#endif
1670 }
1671
1672 constexpr std::size_t not_mask = (1_uz << N) - 1;
1673 return eq(lhs, rhs) ^ not_mask;
1674 }
1675
1676 [[nodiscard]] friend constexpr std::size_t gt(numeric_array const& lhs, numeric_array const& rhs) noexcept
1677 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1678 {
1679 if (not std::is_constant_evaluated()) {
1680#if defined(HI_HAS_AVX2)
1681 if constexpr (is_i64x4) {
1682 return static_cast<std::size_t>(
1683 _mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpgt_epi64(lhs.reg(), rhs.reg()))));
1684 } else if constexpr (is_i32x8) {
1685 return static_cast<std::size_t>(
1686 _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(lhs.reg(), rhs.reg()))));
1687 } else if constexpr (is_i8x32) {
1688 return static_cast<std::size_t>(_mm256_movemask_epi8(_mm256_cmpgt_epi8(lhs.reg(), rhs.reg())));
1689 }
1690#endif
1691#if defined(HI_HAS_AVX)
1692 if constexpr (is_f64x4) {
1693 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_GT_OQ)));
1694 } else if constexpr (is_f32x8) {
1695 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_GT_OQ)));
1696 }
1697#endif
1698#if defined(HI_HAS_SSE4_1)
1699 if constexpr (is_i64x2) {
1700 return static_cast<std::size_t>(_mm_movemask_pd(_mm_castsi128_pd(_mm_cmpgt_epi64(lhs.reg(), rhs.reg()))));
1701 }
1702#endif
1703#if defined(HI_HAS_SSE2)
1704 if constexpr (is_f64x2) {
1705 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpgt_pd(lhs.reg(), rhs.reg())));
1706 } else if constexpr (is_i32x4) {
1707 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(lhs.reg(), rhs.reg()))));
1708 } else if constexpr (is_i8x16) {
1709 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(lhs.reg(), rhs.reg())));
1710 }
1711#endif
1712#if defined(HI_HAS_SSE)
1713 if constexpr (is_f32x4) {
1714 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpgt_ps(lhs.reg(), rhs.reg())));
1715 }
1716#endif
1717 }
1718
1719 std::size_t r = 0;
1720 for (std::size_t i = 0; i != N; ++i) {
1721 r |= static_cast<std::size_t>(lhs.v[i] > rhs.v[i]) << i;
1722 }
1723 return r;
1724 }
1725
1726 [[nodiscard]] friend constexpr std::size_t lt(numeric_array const& lhs, numeric_array const& rhs) noexcept
1727 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1728 {
1729 if (not std::is_constant_evaluated()) {
1730#if defined(HI_HAS_AVX)
1731 if constexpr (is_f64x4) {
1732 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_LT_OQ)));
1733 } else if constexpr (is_f32x8) {
1734 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_LT_OQ)));
1735 }
1736#endif
1737#if defined(HI_HAS_SSE2)
1738 if constexpr (is_f64x2) {
1739 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmplt_pd(lhs.reg(), rhs.reg())));
1740 } else if constexpr (is_i32x4) {
1741 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(lhs.reg(), rhs.reg()))));
1742 } else if constexpr (is_i8x16) {
1743 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmplt_epi8(lhs.reg(), rhs.reg())));
1744 }
1745#endif
1746#if defined(HI_HAS_SSE)
1747 if constexpr (is_f32x4) {
1748 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmplt_ps(lhs.reg(), rhs.reg())));
1749 }
1750#endif
1751 }
1752
1753 // gt() and eq() have the best x64 support.
1754 return gt(rhs, lhs);
1755 }
1756
1757 [[nodiscard]] friend constexpr std::size_t ge(numeric_array const& lhs, numeric_array const& rhs) noexcept
1758 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1759 {
1760 if (not std::is_constant_evaluated()) {
1761#if defined(HI_HAS_AVX)
1762 if constexpr (is_f64x4) {
1763 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_GE_OQ)));
1764 } else if constexpr (is_f32x8) {
1765 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_GE_OQ)));
1766 }
1767#endif
1768#if defined(HI_HAS_SSE2)
1769 if constexpr (is_f64x2) {
1770 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpge_pd(lhs.reg(), rhs.reg())));
1771 }
1772#endif
1773#if defined(HI_HAS_SSE)
1774 if constexpr (is_f32x4) {
1775 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpge_ps(lhs.reg(), rhs.reg())));
1776 }
1777#endif
1778 }
1779
1780 // gt() and eq() have the best x64 support.
1781 return gt(lhs, rhs) | eq(lhs, rhs);
1782 }
1783
1784 [[nodiscard]] friend constexpr std::size_t le(numeric_array const& lhs, numeric_array const& rhs) noexcept
1785 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1786 {
1787 if (not std::is_constant_evaluated()) {
1788#if defined(HI_HAS_AVX)
1789 if constexpr (is_f64x4) {
1790 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_LE_OQ)));
1791 } else if constexpr (is_f32x8) {
1792 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_LE_OQ)));
1793 }
1794#endif
1795#if defined(HI_HAS_SSE2)
1796 if constexpr (is_f64x2) {
1797 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmple_pd(lhs.reg(), rhs.reg())));
1798 }
1799#endif
1800#if defined(HI_HAS_SSE)
1801 if constexpr (is_f32x4) {
1802 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmple_ps(lhs.reg(), rhs.reg())));
1803 }
1804#endif
1805 }
1806
1807 // gt() and eq() have the best x64 support.
1808 return gt(rhs, lhs) | eq(rhs, lhs);
1809 }
1810
1811 [[nodiscard]] friend constexpr numeric_array gt_mask(numeric_array const& lhs, numeric_array const& rhs) noexcept
1812 {
1813 if (not std::is_constant_evaluated()) {
1814#if defined(HI_HAS_SSE4_2)
1815 if constexpr (is_i64x2) {
1816 return numeric_array{_mm_cmpgt_epi64(lhs.reg(), rhs.reg())};
1817 }
1818#endif
1819#if defined(HI_HAS_SSE2)
1820 if constexpr (is_i32x4) {
1821 return numeric_array{_mm_cmpgt_epi32(lhs.reg(), rhs.reg())};
1822 } else if constexpr (is_i16x8) {
1823 return numeric_array{_mm_cmpgt_epi16(lhs.reg(), rhs.reg())};
1824 } else if constexpr (is_i8x16) {
1825 return numeric_array{_mm_cmpgt_epi8(lhs.reg(), rhs.reg())};
1826 }
1827#endif
1828#if defined(HI_HAS_SSE)
1829 if constexpr (is_f32x4) {
1830 return numeric_array{_mm_cmpgt_ps(lhs.reg(), rhs.reg())};
1831 }
1832#endif
1833 }
1834
1835 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
1836 constexpr auto ones = std::bit_cast<T>(~uint_type{0});
1837
1838 auto r = numeric_array{};
1839 for (std::size_t i = 0; i != N; ++i) {
1840 r[i] = lhs.v[i] > rhs.v[i] ? ones : T{0};
1841 }
1842 return r;
1843 }
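
 Unlike gt(), gt_mask() returns a per-element mask of all-one bits, which is the form blend() expects. A small sketch (illustrative, using the i32x4 alias and element-wise constructor from this file):

   hilet a = hi::i32x4{1, 5, 3, 7};
   hilet b = hi::i32x4{4, 4, 4, 4};
   hilet m = gt_mask(a, b); // {0, -1, 0, -1}: all bits set where a[i] > b[i]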
1844
1845 [[nodiscard]] friend constexpr bool operator==(numeric_array const& lhs, numeric_array const& rhs) noexcept
1846 {
1847 return not ne(lhs, rhs);
1848 }
1849
1850 [[nodiscard]] friend constexpr numeric_array operator<<(numeric_array const& lhs, unsigned int rhs) noexcept
1851 {
1852 if (not std::is_constant_evaluated()) {
1853#if defined(HI_HAS_AVX2)
1854 if constexpr (is_f64x4) {
1855 return numeric_array{_mm256_castsi256_pd(_mm256_slli_epi64(_mm256_castpd_si256(lhs.reg()), rhs))};
1856 } else if constexpr (is_f32x8) {
1857 return numeric_array{_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(lhs.reg()), rhs))};
1858 } else if constexpr (is_i64x4 or is_u64x4) {
1859 return numeric_array{_mm256_slli_epi64(lhs.reg(), rhs)};
1860 } else if constexpr (is_i32x8 or is_u32x8) {
1861 return numeric_array{_mm256_slli_epi32(lhs.reg(), rhs)};
1862 } else if constexpr (is_i16x16 or is_u16x16) {
1863 return numeric_array{_mm256_slli_epi16(lhs.reg(), rhs)};
1864 }
1865#endif
1866#if defined(HI_HAS_SSE2)
1867 if constexpr (is_f64x2) {
1868 return numeric_array{_mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(lhs.reg()), rhs))};
1869 } else if constexpr (is_f32x4) {
1870 return numeric_array{_mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(lhs.reg()), rhs))};
1871 } else if constexpr (is_i64x2 or is_u64x2) {
1872 return numeric_array{_mm_slli_epi64(lhs.reg(), rhs)};
1873 } else if constexpr (is_i32x4 or is_u32x4) {
1874 return numeric_array{_mm_slli_epi32(lhs.reg(), rhs)};
1875 } else if constexpr (is_i16x8 or is_u16x8) {
1876 return numeric_array{_mm_slli_epi16(lhs.reg(), rhs)};
1877 }
1878#endif
1879 }
1880
1881 auto r = numeric_array{};
1882 for (std::size_t i = 0; i != N; ++i) {
1883 r.v[i] = lhs.v[i] << rhs;
1884 }
1885 return r;
1886 }
1887
1888 [[nodiscard]] friend constexpr numeric_array operator>>(numeric_array const& lhs, unsigned int rhs) noexcept
1889 {
1890 if (not std::is_constant_evaluated()) {
1891#if defined(HI_HAS_AVX2)
1892 if constexpr (is_f64x4) {
1893 return numeric_array{_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_castpd_si256(lhs.reg()), rhs))};
1894 } else if constexpr (is_f32x8) {
1895 return numeric_array{_mm256_castsi256_ps(_mm256_srli_epi32(_mm256_castps_si256(lhs.reg()), rhs))};
1896 } else if constexpr (is_u64x4) {
1897 return numeric_array{_mm256_srli_epi64(lhs.reg(), rhs)};
1898 } else if constexpr (is_i32x8) {
1899 return numeric_array{_mm256_srai_epi32(lhs.reg(), rhs)};
1900 } else if constexpr (is_u32x8) {
1901 return numeric_array{_mm256_srli_epi32(lhs.reg(), rhs)};
1902 } else if constexpr (is_i16x16) {
1903 return numeric_array{_mm256_srai_epi16(lhs.reg(), rhs)};
1904 } else if constexpr (is_u16x16) {
1905 return numeric_array{_mm256_srli_epi16(lhs.reg(), rhs)};
1906 }
1907#endif
1908#if defined(HI_HAS_SSE2)
1909 if constexpr (is_f64x2) {
1910 return numeric_array{_mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(lhs.reg()), rhs))};
1911 } else if constexpr (is_f32x4) {
1912 return numeric_array{_mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(lhs.reg()), rhs))};
1913 } else if constexpr (is_u64x2) {
1914 return numeric_array{_mm_srli_epi64(lhs.reg(), rhs)};
1915 } else if constexpr (is_i32x4) {
1916 return numeric_array{_mm_srai_epi32(lhs.reg(), rhs)};
1917 } else if constexpr (is_u32x4) {
1918 return numeric_array{_mm_srli_epi32(lhs.reg(), rhs)};
1919 } else if constexpr (is_i16x8) {
1920 return numeric_array{_mm_srai_epi16(lhs.reg(), rhs)};
1921 } else if constexpr (is_u16x8) {
1922 return numeric_array{_mm_srli_epi16(lhs.reg(), rhs)};
1923 }
1924#endif
1925 }
1926
1927 auto r = numeric_array{};
1928 for (std::size_t i = 0; i != N; ++i) {
1929 r.v[i] = lhs.v[i] >> rhs;
1930 }
1931 return r;
1932 }
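
 The element type selects between arithmetic and logical right shifts; a small sketch of the difference (illustrative only):

   hilet s = hi::i32x4{-8, 8, -1, 1};
   hilet u = hi::u32x4{0x80000000u, 8u, 1u, 0u};
   hilet a = s >> 1; // {-4, 4, -1, 0}: arithmetic shift, the sign bit is replicated
   hilet b = u >> 1; // {0x40000000u, 4u, 0u, 0u}: logical shift, zero fill
   hilet c = s << 1; // {-16, 16, -2, 2}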
1933
1938 [[nodiscard]] friend constexpr numeric_array rotl(numeric_array const& lhs, unsigned int rhs) noexcept
1939 {
1940 hi_axiom(rhs > 0 and rhs < sizeof(value_type) * CHAR_BIT);
1941
1942 hilet remainder = narrow<unsigned int>(sizeof(value_type) * CHAR_BIT - rhs);
1943
1944 return (lhs << rhs) | (lhs >> remainder);
1945 }
1946
1951 [[nodiscard]] friend constexpr numeric_array rotr(numeric_array const& lhs, unsigned int rhs) noexcept
1952 {
1953 hi_axiom(rhs > 0 and rhs < sizeof(value_type) * CHAR_BIT);
1954
1955 hilet remainder = narrow<unsigned int>(sizeof(value_type) * CHAR_BIT - rhs);
1956
1957 return (lhs >> rhs) | (lhs << remainder);
1958 }
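
 A usage sketch for the element-wise rotates (illustrative, using the u32x4 alias from this file); note the axiom: the rotate count must stay strictly between 0 and the element width.

   hilet v = hi::u32x4{0x80000001u, 2u, 3u, 4u};
   hilet l = rotl(v, 1); // {0x00000003u, 4u, 6u, 8u}
   hilet r = rotr(v, 1); // {0xc0000000u, 1u, 0x80000001u, 2u}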
1959
1960 [[nodiscard]] friend constexpr numeric_array operator|(numeric_array const& lhs, numeric_array const& rhs) noexcept
1961 {
1962 if (not std::is_constant_evaluated()) {
1963#if defined(HI_HAS_AVX2)
1964 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
1965 return numeric_array{_mm256_or_si256(lhs.reg(), rhs.reg())};
1966 }
1967#endif
1968#if defined(HI_HAS_AVX)
1969 if constexpr (is_f64x4) {
1970 return numeric_array{_mm256_or_pd(lhs.reg(), rhs.reg())};
1971 } else if constexpr (is_f32x8) {
1972 return numeric_array{_mm256_or_ps(lhs.reg(), rhs.reg())};
1973 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
1974 return numeric_array{
1975 _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
1976 }
1977#endif
1978#if defined(HI_HAS_SSE2)
1979 if constexpr (is_f64x2) {
1980 return numeric_array{_mm_or_pd(lhs.reg(), rhs.reg())};
1981 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
1982 return numeric_array{_mm_or_si128(lhs.reg(), rhs.reg())};
1983 }
1984#endif
1985#if defined(HI_HAS_SSE)
1986 if constexpr (is_f64x2) {
1987 return numeric_array{_mm_castps_pd(_mm_or_ps(_mm_castpd_ps(lhs.reg()), _mm_castpd_ps(rhs.reg())))};
1988
1989 } else if constexpr (is_f32x4) {
1990 return numeric_array{_mm_or_ps(lhs.reg(), rhs.reg())};
1991
1992 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
1993 return numeric_array{_mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
1994 }
1995#endif
1996 }
1997
1998 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
1999
2000 auto r = numeric_array{};
2001 for (std::size_t i = 0; i != N; ++i) {
2002 r.v[i] =
2003 std::bit_cast<T>(static_cast<uint_type>(std::bit_cast<uint_type>(lhs.v[i]) | std::bit_cast<uint_type>(rhs.v[i])));
2004 }
2005 return r;
2006 }
2007
2008 [[nodiscard]] friend constexpr numeric_array operator|(numeric_array const& lhs, T const& rhs) noexcept
2009 {
2010 return lhs | broadcast(rhs);
2011 }
2012
2013 [[nodiscard]] friend constexpr numeric_array operator|(T const& lhs, numeric_array const& rhs) noexcept
2014 {
2015 return broadcast(lhs) | rhs;
2016 }
2017
2018 [[nodiscard]] friend constexpr numeric_array operator&(numeric_array const& lhs, numeric_array const& rhs) noexcept
2019 {
2020 if (not std::is_constant_evaluated()) {
2021#if defined(HI_HAS_AVX2)
2022 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
2023 return numeric_array{_mm256_and_si256(lhs.reg(), rhs.reg())};
2024 }
2025#endif
2026#if defined(HI_HAS_AVX)
2027 if constexpr (is_f64x4) {
2028 return numeric_array{_mm256_and_pd(lhs.reg(), rhs.reg())};
2029 } else if constexpr (is_f32x8) {
2030 return numeric_array{_mm256_and_ps(lhs.reg(), rhs.reg())};
2031 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
2032 return numeric_array{
2033 _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
2034 }
2035#endif
2036#if defined(HI_HAS_SSE2)
2037 if constexpr (is_f64x2) {
2038 return numeric_array{_mm_and_pd(lhs.reg(), rhs.reg())};
2039 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
2040 return numeric_array{_mm_and_si128(lhs.reg(), rhs.reg())};
2041 }
2042#endif
2043#if defined(HI_HAS_SSE)
2044 if constexpr (is_f64x2) {
2045 return numeric_array{_mm_castps_pd(_mm_and_ps(_mm_castpd_ps(lhs.reg()), _mm_castpd_ps(rhs.reg())))};
2046
2047 } else if constexpr (is_f32x4) {
2048 return numeric_array{_mm_and_ps(lhs.reg(), rhs.reg())};
2049
2050 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
2051 return numeric_array{_mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
2052 }
2053#endif
2054 }
2055 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
2056 auto r = numeric_array{};
2057 for (std::size_t i = 0; i != N; ++i) {
2058 r.v[i] = std::bit_cast<T>(static_cast<uint_type>(std::bit_cast<uint_type>(lhs.v[i]) & std::bit_cast<uint_type>(rhs.v[i])));
2059 }
2060 return r;
2061 }
2062
2063 [[nodiscard]] friend constexpr numeric_array operator&(numeric_array const& lhs, T const& rhs) noexcept
2064 {
2065 return lhs & broadcast(rhs);
2066 }
2067
2068 [[nodiscard]] friend constexpr numeric_array operator&(T const& lhs, numeric_array const& rhs) noexcept
2069 {
2070 return broadcast(lhs) & rhs;
2071 }
2072
2073 [[nodiscard]] friend constexpr numeric_array operator^(numeric_array const& lhs, numeric_array const& rhs) noexcept
2074 {
2075 if (not std::is_constant_evaluated()) {
2076#if defined(HI_HAS_AVX2)
2077 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
2078 return numeric_array{_mm256_xor_si256(lhs.reg(), rhs.reg())};
2079 }
2080#endif
2081#if defined(HI_HAS_AVX)
2082 if constexpr (is_f64x4) {
2083 return numeric_array{_mm256_xor_pd(lhs.reg(), rhs.reg())};
2084 } else if constexpr (is_f32x8) {
2085 return numeric_array{_mm256_xor_ps(lhs.reg(), rhs.reg())};
2086 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
2087 return numeric_array{
2088 _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
2089 }
2090#endif
2091#if defined(HI_HAS_SSE2)
2092 if constexpr (is_f64x2) {
2093 return numeric_array{_mm_xor_pd(lhs.reg(), rhs.reg())};
2094 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
2095 return numeric_array{_mm_xor_si128(lhs.reg(), rhs.reg())};
2096 }
2097#endif
2098#if defined(HI_HAS_SSE)
2099 if constexpr (is_f64x2) {
2100 return numeric_array{_mm_castps_pd(_mm_xor_ps(_mm_castps_ps(lhs.reg()), _mm_castps_ps(rhs.reg())))};
2101
2102 } else if constexpr (is_f32x4) {
2103 return numeric_array{_mm_xor_ps(lhs.reg(), rhs.reg())};
2104
2105 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
2106 return numeric_array{_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
2107 }
2108#endif
2109 }
2110 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
2111 auto r = numeric_array{};
2112 for (std::size_t i = 0; i != N; ++i) {
2113 r.v[i] = std::bit_cast<T>(static_cast<uint_type>(std::bit_cast<uint_type>(lhs.v[i]) ^ std::bit_cast<uint_type>(rhs.v[i])));
2114 }
2115 return r;
2116 }
2117
2118 [[nodiscard]] friend constexpr numeric_array operator^(numeric_array const& lhs, T const& rhs) noexcept
2119 {
2120 return lhs ^ broadcast(rhs);
2121 }
2122
2123 [[nodiscard]] friend constexpr numeric_array operator^(T const& lhs, numeric_array const& rhs) noexcept
2124 {
2125 return broadcast(lhs) ^ rhs;
2126 }
2127
2128 [[nodiscard]] friend constexpr numeric_array operator+(numeric_array const& lhs, numeric_array const& rhs) noexcept
2129 {
2130 if (not std::is_constant_evaluated()) {
2131#if defined(HI_HAS_AVX2)
2132 if constexpr (is_i64x4 or is_u64x4) {
2133 return numeric_array{_mm256_add_epi64(lhs.reg(), rhs.reg())};
2134 } else if constexpr (is_i32x8 or is_u32x8) {
2135 return numeric_array{_mm256_add_epi32(lhs.reg(), rhs.reg())};
2136 } else if constexpr (is_i16x16 or is_u16x16) {
2137 return numeric_array{_mm256_add_epi16(lhs.reg(), rhs.reg())};
2138 } else if constexpr (is_i8x32 or is_u8x32) {
2139 return numeric_array{_mm256_add_epi8(lhs.reg(), rhs.reg())};
2140 }
2141#endif
2142#if defined(HI_HAS_AVX)
2143 if constexpr (is_f64x4) {
2144 return numeric_array{_mm256_add_pd(lhs.reg(), rhs.reg())};
2145 } else if constexpr (is_f32x8) {
2146 return numeric_array{_mm256_add_ps(lhs.reg(), rhs.reg())};
2147 }
2148#endif
2149#if defined(HI_HAS_SSE2)
2150 if constexpr (is_f64x2) {
2151 return numeric_array{_mm_add_pd(lhs.reg(), rhs.reg())};
2152 } else if constexpr (is_i64x2 or is_u64x2) {
2153 return numeric_array{_mm_add_epi64(lhs.reg(), rhs.reg())};
2154 } else if constexpr (is_i32x4 or is_u32x4) {
2155 return numeric_array{_mm_add_epi32(lhs.reg(), rhs.reg())};
2156 } else if constexpr (is_i16x8 or is_u16x8) {
2157 return numeric_array{_mm_add_epi16(lhs.reg(), rhs.reg())};
2158 } else if constexpr (is_i8x16 or is_u8x16) {
2159 return numeric_array{_mm_add_epi8(lhs.reg(), rhs.reg())};
2160 }
2161#endif
2162#if defined(HI_HAS_SSE)
2163 if constexpr (is_f32x4) {
2164 return numeric_array{_mm_add_ps(lhs.reg(), rhs.reg())};
2165 }
2166#endif
2167 }
2168
2169 auto r = numeric_array{};
2170 for (std::size_t i = 0; i != N; ++i) {
2171 r.v[i] = lhs.v[i] + rhs.v[i];
2172 }
2173 return r;
2174 }
2175
2176 [[nodiscard]] friend constexpr numeric_array operator+(numeric_array const& lhs, T const& rhs) noexcept
2177 {
2178 return lhs + broadcast(rhs);
2179 }
2180
2181 [[nodiscard]] friend constexpr numeric_array operator+(T const& lhs, numeric_array const& rhs) noexcept
2182 {
2183 return broadcast(lhs) + rhs;
2184 }
2185
2186 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const& lhs, numeric_array const& rhs) noexcept
2187 {
2188 if (not std::is_constant_evaluated()) {
2189#if defined(HI_HAS_AVX2)
2190 if constexpr (is_i64x4 or is_u64x4) {
2191 return numeric_array{_mm256_sub_epi64(lhs.reg(), rhs.reg())};
2192 } else if constexpr (is_i32x8 or is_u32x8) {
2193 return numeric_array{_mm256_sub_epi32(lhs.reg(), rhs.reg())};
2194 } else if constexpr (is_i16x16 or is_u16x16) {
2195 return numeric_array{_mm256_sub_epi16(lhs.reg(), rhs.reg())};
2196 } else if constexpr (is_i8x32 or is_u8x32) {
2197 return numeric_array{_mm256_sub_epi8(lhs.reg(), rhs.reg())};
2198 }
2199#endif
2200#if defined(HI_HAS_AVX)
2201 if constexpr (is_f64x4) {
2202 return numeric_array{_mm256_sub_pd(lhs.reg(), rhs.reg())};
2203 } else if constexpr (is_f32x8) {
2204 return numeric_array{_mm256_sub_ps(lhs.reg(), rhs.reg())};
2205 }
2206#endif
2207#if defined(HI_HAS_SSE2)
2208 if constexpr (is_f64x2) {
2209 return numeric_array{_mm_sub_pd(lhs.reg(), rhs.reg())};
2210 } else if constexpr (is_i64x2 or is_u64x2) {
2211 return numeric_array{_mm_sub_epi64(lhs.reg(), rhs.reg())};
2212 } else if constexpr (is_i32x4 or is_u32x4) {
2213 return numeric_array{_mm_sub_epi32(lhs.reg(), rhs.reg())};
2214 } else if constexpr (is_i16x8 or is_u16x8) {
2215 return numeric_array{_mm_sub_epi16(lhs.reg(), rhs.reg())};
2216 } else if constexpr (is_i8x16 or is_u8x16) {
2217 return numeric_array{_mm_sub_epi8(lhs.reg(), rhs.reg())};
2218 }
2219#endif
2220#if defined(HI_HAS_SSE)
2221 if constexpr (is_f32x4) {
2222 return numeric_array{_mm_sub_ps(lhs.reg(), rhs.reg())};
2223 }
2224#endif
2225 }
2226
2227 auto r = numeric_array{};
2228 for (std::size_t i = 0; i != N; ++i) {
2229 r.v[i] = lhs.v[i] - rhs.v[i];
2230 }
2231 return r;
2232 }
2233
2234 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const& lhs, T const& rhs) noexcept
2235 {
2236 return lhs - broadcast(rhs);
2237 }
2238
2239 [[nodiscard]] friend constexpr numeric_array operator-(T const& lhs, numeric_array const& rhs) noexcept
2240 {
2241 return broadcast(lhs) - rhs;
2242 }
2243
2244 [[nodiscard]] friend constexpr numeric_array operator*(numeric_array const& lhs, numeric_array const& rhs) noexcept
2245 {
2246 if (not std::is_constant_evaluated()) {
2247#if defined(HI_HAS_AVX2)
2248 if constexpr (is_i32x8) {
2249 return numeric_array{_mm256_mullo_epi32(lhs.reg(), rhs.reg())}; // low 32 bits of each product, matching the scalar fallback
2250 } else if constexpr (is_u32x8) {
2251 return numeric_array{_mm256_mullo_epi32(lhs.reg(), rhs.reg())};
2252 }
2253#endif
2254#if defined(HI_HAS_AVX)
2255 if constexpr (is_f64x4) {
2256 return numeric_array{_mm256_mul_pd(lhs.reg(), rhs.reg())};
2257 } else if constexpr (is_f32x8) {
2258 return numeric_array{_mm256_mul_ps(lhs.reg(), rhs.reg())};
2259 }
2260#endif
2261#if defined(HI_HAS_SSE4_1)
2262 if constexpr (is_i32x4) {
2263 return numeric_array{_mm_mullo_epi32(lhs.reg(), rhs.reg())}; // low 32 bits of each product, matching the scalar fallback
2264 } else if constexpr (is_f16x4) {
2265 return numeric_array{numeric_array<float, 4>{lhs} * numeric_array<float, 4>{rhs}};
2266 }
2267#endif
2268#if defined(HI_HAS_SSE2)
2269 if constexpr (is_f64x2) {
2270 return numeric_array{_mm_mul_pd(lhs.reg(), rhs.reg())};
2271 }
2272#endif
2273#if defined(HI_HAS_SSE)
2274 if constexpr (is_f32x4) {
2275 return numeric_array{_mm_mul_ps(lhs.reg(), rhs.reg())};
2276 }
2277#endif
2278 }
2279
2280 auto r = numeric_array{};
2281 for (std::size_t i = 0; i != N; ++i) {
2282 r.v[i] = lhs.v[i] * rhs.v[i];
2283 }
2284 return r;
2285 }
2286
2287 [[nodiscard]] friend constexpr numeric_array operator*(numeric_array const& lhs, T const& rhs) noexcept
2288 {
2289 return lhs * broadcast(rhs);
2290 }
2291
2292 [[nodiscard]] friend constexpr numeric_array operator*(T const& lhs, numeric_array const& rhs) noexcept
2293 {
2294 return broadcast(lhs) * rhs;
2295 }
2296
2297 [[nodiscard]] friend constexpr numeric_array operator/(numeric_array const& lhs, numeric_array const& rhs) noexcept
2298 {
2299 if (not std::is_constant_evaluated()) {
2300#if defined(HI_HAS_AVX)
2301 if constexpr (is_f64x4) {
2302 return numeric_array{_mm256_div_pd(lhs.reg(), rhs.reg())};
2303 } else if constexpr (is_f32x8) {
2304 return numeric_array{_mm256_div_ps(lhs.reg(), rhs.reg())};
2305 }
2306#endif
2307#if defined(HI_HAS_SSE2)
2308 if constexpr (is_f64x2) {
2309 return numeric_array{_mm_div_pd(lhs.reg(), rhs.reg())};
2310 }
2311#endif
2312#if defined(HI_HAS_SSE)
2313 if constexpr (is_f32x4) {
2314 return numeric_array{_mm_div_ps(lhs.reg(), rhs.reg())};
2315 }
2316#endif
2317 }
2318
2319 auto r = numeric_array{};
2320 for (std::size_t i = 0; i != N; ++i) {
2321 r.v[i] = lhs.v[i] / rhs.v[i];
2322 }
2323 return r;
2324 }
2325
2326 [[nodiscard]] friend constexpr numeric_array operator/(numeric_array const& lhs, T const& rhs) noexcept
2327 {
2328 return lhs / broadcast(rhs);
2329 }
2330
2331 [[nodiscard]] friend constexpr numeric_array operator/(T const& lhs, numeric_array const& rhs) noexcept
2332 {
2333 return broadcast(lhs) / rhs;
2334 }
2335
2336 [[nodiscard]] friend constexpr numeric_array operator%(numeric_array const& lhs, numeric_array const& rhs) noexcept
2337 {
2338 hilet div_result = floor(lhs / rhs);
2339 return lhs - (div_result * rhs);
2340 }
2341
2342 [[nodiscard]] friend constexpr numeric_array operator%(numeric_array const& lhs, T const& rhs) noexcept
2343 {
2344 return lhs % broadcast(rhs);
2345 }
2346
2347 [[nodiscard]] friend constexpr numeric_array operator%(T const& lhs, numeric_array const& rhs) noexcept
2348 {
2349 return broadcast(lhs) % rhs;
2350 }
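
 Because the remainder is computed as lhs - floor(lhs / rhs) * rhs, this is a floored modulo: the result takes the sign of the divisor, unlike std::fmod. A small sketch (illustrative):

   hilet a = hi::f32x4{5.5f, -5.5f, 7.0f, 1.0f};
   hilet r = a % 2.0f; // {1.5f, 0.5f, 1.0f, 1.0f}; std::fmod(-5.5f, 2.0f) would give -1.5f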
2351
2352 [[nodiscard]] friend constexpr numeric_array min(numeric_array const& lhs, numeric_array const& rhs) noexcept
2353 {
2354 if (not std::is_constant_evaluated()) {
2355#if defined(HI_HAS_AVX2)
2356 if constexpr (is_i32x8) {
2357 return numeric_array{_mm256_min_epi32(lhs.reg(), rhs.reg())};
2358 } else if constexpr (is_u32x8) {
2359 return numeric_array{_mm256_min_epu32(lhs.reg(), rhs.reg())};
2360 } else if constexpr (is_i16x16) {
2361 return numeric_array{_mm256_min_epi16(lhs.reg(), rhs.reg())};
2362 } else if constexpr (is_u16x16) {
2363 return numeric_array{_mm256_min_epu16(lhs.reg(), rhs.reg())};
2364 } else if constexpr (is_i8x32) {
2365 return numeric_array{_mm256_min_epi8(lhs.reg(), rhs.reg())};
2366 } else if constexpr (is_u8x32) {
2367 return numeric_array{_mm256_min_epu8(lhs.reg(), rhs.reg())};
2368 }
2369#endif
2370#if defined(HI_HAS_AVX)
2371 if constexpr (is_f64x4) {
2372 return numeric_array{_mm256_min_pd(lhs.reg(), rhs.reg())};
2373 } else if constexpr (is_f32x8) {
2374 return numeric_array{_mm256_min_ps(lhs.reg(), rhs.reg())};
2375 }
2376#endif
2377#if defined(HI_HAS_SSE4_1)
2378 if constexpr (is_i32x4) {
2379 return numeric_array{_mm_min_epi32(lhs.reg(), rhs.reg())};
2380 } else if constexpr (is_u32x4) {
2381 return numeric_array{_mm_min_epu32(lhs.reg(), rhs.reg())};
2382 } else if constexpr (is_u16x8) {
2383 return numeric_array{_mm_min_epu16(lhs.reg(), rhs.reg())};
2384 } else if constexpr (is_i8x16) {
2385 return numeric_array{_mm_min_epi8(lhs.reg(), rhs.reg())};
2386 }
2387#endif
2388#if defined(HI_HAS_SSE2)
2389 if constexpr (is_f64x2) {
2390 return numeric_array{_mm_min_pd(lhs.reg(), rhs.reg())};
2391 } else if constexpr (is_i16x8) {
2392 return numeric_array{_mm_min_epi16(lhs.reg(), rhs.reg())};
2393 } else if constexpr (is_u8x16) {
2394 return numeric_array{_mm_min_epu8(lhs.reg(), rhs.reg())};
2395 }
2396#endif
2397#if defined(HI_HAS_SSE)
2398 if constexpr (is_f32x4) {
2399 return numeric_array{_mm_min_ps(lhs.reg(), rhs.reg())};
2400 }
2401#endif
2402 }
2403
2404 auto r = numeric_array{};
2405 for (std::size_t i = 0; i != N; ++i) {
2406 r.v[i] = std::min(lhs.v[i], rhs.v[i]);
2407 }
2408 return r;
2409 }
2410
2411 [[nodiscard]] friend constexpr numeric_array max(numeric_array const& lhs, numeric_array const& rhs) noexcept
2412 {
2413 if (not std::is_constant_evaluated()) {
2414#if defined(HI_HAS_AVX2)
2415 if constexpr (is_i32x8) {
2416 return numeric_array{_mm256_max_epi32(lhs.reg(), rhs.reg())};
2417 } else if constexpr (is_u32x8) {
2418 return numeric_array{_mm256_max_epu32(lhs.reg(), rhs.reg())};
2419 } else if constexpr (is_i16x16) {
2420 return numeric_array{_mm256_max_epi16(lhs.reg(), rhs.reg())};
2421 } else if constexpr (is_u16x16) {
2422 return numeric_array{_mm256_max_epu16(lhs.reg(), rhs.reg())};
2423 } else if constexpr (is_i8x32) {
2424 return numeric_array{_mm256_max_epi8(lhs.reg(), rhs.reg())};
2425 } else if constexpr (is_u8x32) {
2426 return numeric_array{_mm256_max_epu8(lhs.reg(), rhs.reg())};
2427 }
2428#endif
2429#if defined(HI_HAS_AVX)
2430 if constexpr (is_f64x4) {
2431 return numeric_array{_mm256_max_pd(lhs.reg(), rhs.reg())};
2432 } else if constexpr (is_f32x8) {
2433 return numeric_array{_mm256_max_ps(lhs.reg(), rhs.reg())};
2434 }
2435#endif
2436#if defined(HI_HAS_SSE4_1)
2437 if constexpr (is_i32x4) {
2438 return numeric_array{_mm_max_epi32(lhs.reg(), rhs.reg())};
2439 } else if constexpr (is_u32x4) {
2440 return numeric_array{_mm_max_epu32(lhs.reg(), rhs.reg())};
2441 } else if constexpr (is_u16x8) {
2442 return numeric_array{_mm_max_epu16(lhs.reg(), rhs.reg())};
2443 } else if constexpr (is_i8x16) {
2444 return numeric_array{_mm_max_epi8(lhs.reg(), rhs.reg())};
2445 }
2446#endif
2447#if defined(HI_HAS_SSE2)
2448 if constexpr (is_f64x2) {
2449 return numeric_array{_mm_max_pd(lhs.reg(), rhs.reg())};
2450 } else if constexpr (is_i16x8) {
2451 return numeric_array{_mm_max_epi16(lhs.reg(), rhs.reg())};
2452 } else if constexpr (is_u8x16) {
2453 return numeric_array{_mm_max_epu8(lhs.reg(), rhs.reg())};
2454 }
2455#endif
2456#if defined(HI_HAS_SSE)
2457 if constexpr (is_f32x4) {
2458 return numeric_array{_mm_max_ps(lhs.reg(), rhs.reg())};
2459 }
2460#endif
2461 }
2462
2463 auto r = numeric_array{};
2464 for (std::size_t i = 0; i != N; ++i) {
2465 r.v[i] = std::max(lhs.v[i], rhs.v[i]);
2466 }
2467 return r;
2468 }
2469
2470 [[nodiscard]] friend constexpr numeric_array
2471 clamp(numeric_array const& lhs, numeric_array const& low, numeric_array const& high) noexcept
2472 {
2473 return min(max(lhs, low), high);
2474 }
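
 A sketch of min(), max() and clamp() (illustrative, assuming broadcast() is the static broadcast helper used throughout this header):

   hilet x = hi::f32x4{-1.0f, 5.0f, 10.0f, 3.0f};
   hilet low = hi::f32x4::broadcast(0.0f);
   hilet high = hi::f32x4::broadcast(8.0f);
   hilet r = clamp(x, low, high); // {0.0f, 5.0f, 8.0f, 3.0f}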
2475
2476 [[nodiscard]] friend constexpr numeric_array hadd(numeric_array const& lhs, numeric_array const& rhs) noexcept
2477 {
2478 if (not std::is_constant_evaluated()) {
2479#if defined(HI_HAS_AVX2)
2480 if constexpr (is_i32x8 or is_u32x8) {
2481 return numeric_array{_mm256_hadd_epi32(lhs.reg(), rhs.reg())};
2482 } else if constexpr (is_i16x16 or is_u16x16) {
2483 return numeric_array{_mm256_hadd_epi16(lhs.reg(), rhs.reg())};
2484 }
2485#endif
2486#if defined(HI_HAS_AVX)
2487 if constexpr (is_f64x4) {
2488 return numeric_array{_mm256_hadd_pd(lhs.reg(), rhs.reg())};
2489 } else if constexpr (is_f32x8) {
2490 return numeric_array{_mm256_hadd_ps(lhs.reg(), rhs.reg())};
2491 }
2492#endif
2493#if defined(HI_HAS_SSSE3)
2494 if constexpr (is_i32x4 or is_u32x4) {
2495 return numeric_array{_mm_hadd_epi32(lhs.reg(), rhs.reg())};
2496 } else if constexpr (is_i16x8 or is_u16x8) {
2497 return numeric_array{_mm_hadd_epi16(lhs.reg(), rhs.reg())};
2498 }
2499#endif
2500#if defined(HI_HAS_SSE3)
2501 if constexpr (is_f64x2) {
2502 return numeric_array{_mm_hadd_pd(lhs.reg(), rhs.reg())};
2503 } else if constexpr (is_f32x4) {
2504 return numeric_array{_mm_hadd_ps(lhs.reg(), rhs.reg())};
2505 }
2506#endif
2507 }
2508
2509 hi_axiom(N % 2 == 0);
2510
2511 auto r = numeric_array{};
2512
2513 std::size_t src_i = 0;
2514 std::size_t dst_i = 0;
2515 while (src_i != N) {
2516 auto tmp = lhs[src_i++];
2517 tmp += lhs[src_i++];
2518 r.v[dst_i++] = tmp;
2519 }
2520
2521 src_i = 0;
2522 while (src_i != N) {
2523 auto tmp = rhs[src_i++];
2524 tmp += rhs[src_i++];
2525 r.v[dst_i++] = tmp;
2526 }
2527 return r;
2528 }
2529
2530 [[nodiscard]] friend constexpr numeric_array hsub(numeric_array const& lhs, numeric_array const& rhs) noexcept
2531 {
2532 if (not std::is_constant_evaluated()) {
2533#if defined(HI_HAS_AVX2)
2534 if constexpr (is_i32x8 or is_u32x8) {
2535 return numeric_array{_mm256_hsub_epi32(lhs.reg(), rhs.reg())};
2536 } else if constexpr (is_i16x16 or is_u16x16) {
2537 return numeric_array{_mm256_hsub_epi16(lhs.reg(), rhs.reg())};
2538 }
2539#endif
2540#if defined(HI_HAS_AVX)
2541 if constexpr (is_f64x4) {
2542 return numeric_array{_mm256_hsub_pd(lhs.reg(), rhs.reg())};
2543 } else if constexpr (is_f32x8) {
2544 return numeric_array{_mm256_hsub_ps(lhs.reg(), rhs.reg())};
2545 }
2546#endif
2547#if defined(HI_HAS_SSSE3)
2548 if constexpr (is_i32x4 or is_u32x4) {
2549 return numeric_array{_mm_hsub_epi32(lhs.reg(), rhs.reg())};
2550 } else if constexpr (is_i16x8 or is_u16x8) {
2551 return numeric_array{_mm_hsub_epi16(lhs.reg(), rhs.reg())};
2552 }
2553#endif
2554#if defined(HI_HAS_SSE3)
2555 if constexpr (is_f64x2) {
2556 return numeric_array{_mm_hsub_pd(lhs.reg(), rhs.reg())};
2557 } else if constexpr (is_f32x4) {
2558 return numeric_array{_mm_hsub_ps(lhs.reg(), rhs.reg())};
2559 }
2560#endif
2561 }
2562
2563 hi_axiom(N % 2 == 0);
2564
2565 auto r = numeric_array{};
2566
2567 std::size_t src_i = 0;
2568 std::size_t dst_i = 0;
2569 while (src_i != N) {
2570 auto tmp = lhs[src_i++];
2571 tmp -= lhs[src_i++];
2572 r.v[dst_i++] = tmp;
2573 }
2574
2575 src_i = 0;
2576 while (src_i != N) {
2577 auto tmp = rhs[src_i++];
2578 tmp -= rhs[src_i++];
2579 r.v[dst_i++] = tmp;
2580 }
2581 return r;
2582 }
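
 The horizontal forms pair up neighbouring elements: for 128-bit types the lhs pairs fill the lower half of the result and the rhs pairs the upper half. A sketch (illustrative):

   hilet a = hi::f32x4{1.0f, 2.0f, 3.0f, 4.0f};
   hilet b = hi::f32x4{5.0f, 6.0f, 7.0f, 8.0f};
   hilet s = hadd(a, b); // {3.0f, 7.0f, 11.0f, 15.0f}
   hilet d = hsub(a, b); // {-1.0f, -1.0f, -1.0f, -1.0f}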
2583
2588 template<std::size_t Mask>
2589 [[nodiscard]] friend constexpr numeric_array addsub(numeric_array const& lhs, numeric_array const& rhs) noexcept
2590 {
2591 constexpr std::size_t not_mask = (std::size_t{1} << N) - 1;
2592 return lhs + neg<Mask ^ not_mask>(rhs);
2593 }
2594
2597 [[nodiscard]] friend constexpr numeric_array cross_2D(numeric_array const& rhs) noexcept requires(N >= 2)
2598 {
2599 hi_axiom(rhs.z() == 0.0f && rhs.is_vector());
2600 return numeric_array{-rhs.y(), rhs.x()};
2601 }
2602
2605 [[nodiscard]] friend constexpr numeric_array normal_2D(numeric_array const& rhs) noexcept requires(N >= 2)
2606 {
2607 return normalize<0b0011>(cross_2D(rhs));
2608 }
2609
2613 [[nodiscard]] friend constexpr float cross_2D(numeric_array const& lhs, numeric_array const& rhs) noexcept requires(N >= 2)
2614 {
2615 hilet tmp1 = rhs.yxwz();
2616 hilet tmp2 = lhs * tmp1;
2617 hilet tmp3 = hsub(tmp2, tmp2);
2618 return get<0>(tmp3);
2619 }
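
 The two-argument cross_2D() is the scalar z-component of the 3D cross product, i.e. the signed area of the parallelogram spanned by the two 2D vectors. A sketch (illustrative):

   hilet a = hi::f32x4{1.0f, 0.0f, 0.0f, 0.0f}; // 2D vector (1, 0)
   hilet b = hi::f32x4{0.0f, 1.0f, 0.0f, 0.0f}; // 2D vector (0, 1)
   hilet z = cross_2D(a, b); // 1.0f == a.x() * b.y() - a.y() * b.x()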
2620
2621 // x=a.y*b.z - a.z*b.y
2622 // y=a.z*b.x - a.x*b.z
2623 // z=a.x*b.y - a.y*b.x
2624 // w=a.w*b.w - a.w*b.w
2625 [[nodiscard]] constexpr friend numeric_array cross_3D(numeric_array const& lhs, numeric_array const& rhs) noexcept
2626 requires(N == 4)
2627 {
2628 hilet a_left = lhs.yzxw();
2629 hilet b_left = rhs.zxyw();
2630 hilet left = a_left * b_left;
2631
2632 hilet a_right = lhs.zxyw();
2633 hilet b_right = rhs.yzxw();
2634 hilet right = a_right * b_right;
2635 return left - right;
2636 }
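
 A sketch of cross_3D() on the unit basis vectors (illustrative):

   hilet x = hi::f32x4{1.0f, 0.0f, 0.0f, 0.0f};
   hilet y = hi::f32x4{0.0f, 1.0f, 0.0f, 0.0f};
   hilet z = cross_3D(x, y); // {0.0f, 0.0f, 1.0f, 0.0f}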
2637
2638 [[nodiscard]] static constexpr numeric_array byte_srl_shuffle_indices(unsigned int rhs) requires(is_i8x16)
2639 {
2640 static_assert(std::endian::native == std::endian::little);
2641
2642 auto r = numeric_array{};
2643 for (auto i = 0; i != 16; ++i) {
2644 if ((i + rhs) < 16) {
2645 r[i] = narrow_cast<int8_t>(i + rhs);
2646 } else {
2647 // Indices set to -1 result in a zero after a byte shuffle.
2648 r[i] = -1;
2649 }
2650 }
2651 return r;
2652 }
2653
2654 [[nodiscard]] static constexpr numeric_array byte_sll_shuffle_indices(unsigned int rhs) requires(is_i8x16)
2655 {
2656 static_assert(std::endian::native == std::endian::little);
2657
2658 auto r = numeric_array{};
2659 for (auto i = 0; i != 16; ++i) {
2660 if (i >= narrow_cast<int>(rhs)) { // compare as signed; (i - rhs) would wrap around as unsigned
2661 r[i] = narrow_cast<int8_t>(i - rhs);
2662 } else {
2663 // Indices set to -1 result in a zero after a byte shuffle.
2664 r[i] = -1;
2665 }
2666 }
2667 return r;
2668 }
2669
2672 [[nodiscard]] friend constexpr numeric_array shuffle(numeric_array const& lhs, numeric_array const& rhs) noexcept
2673 requires(std::is_integral_v<value_type>)
2674 {
2675 if (!std::is_constant_evaluated()) {
2676#if defined(HI_HAS_SSSE3)
2677 if constexpr (is_i8x16 or is_u8x16) {
2678 return numeric_array{_mm_shuffle_epi8(lhs.reg(), rhs.reg())};
2679 }
2680#endif
2681 }
2682
2683 auto r = numeric_array{};
2684 for (std::size_t i = 0; i != N; ++i) {
2685 if (rhs[i] >= 0) {
2686 r[i] = lhs[rhs[i] & 0xf];
2687 } else {
2688 r[i] = 0;
2689 }
2690 }
2691
2692 return r;
2693 }
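
 Combining the index generators above with shuffle() gives a whole-register byte shift; a sketch using the i8x16 alias from the end of this file (assuming the element-wise constructor accepts all sixteen lanes; indices of -1 produce zero bytes):

   hilet bytes = hi::i8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
   hilet idx = hi::i8x16::byte_srl_shuffle_indices(1);
   hilet r = shuffle(bytes, idx); // {1, 2, ..., 15, 0}: every byte moved one slot down, zero-filled at the end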
2694
2697 [[nodiscard]] friend constexpr numeric_array midpoint(numeric_array const& p1, numeric_array const& p2) noexcept
2698 {
2699 hi_axiom(p1.is_point());
2700 hi_axiom(p2.is_point());
2701 return (p1 + p2) * 0.5f;
2702 }
2703
2706 [[nodiscard]] friend constexpr numeric_array reflect_point(numeric_array const& p, numeric_array const anchor) noexcept
2707 {
2708 hi_axiom(p.is_point());
2709 hi_axiom(anchor.is_point());
2710 return anchor - (p - anchor);
2711 }
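
 A sketch of the point helpers (illustrative, assuming, as elsewhere in HikoGUI, that points carry w == 1, which is what is_point() asserts):

   hilet p1 = hi::f32x4{2.0f, 2.0f, 0.0f, 1.0f};
   hilet p2 = hi::f32x4{4.0f, 6.0f, 0.0f, 1.0f};
   hilet m = midpoint(p1, p2);     // {3.0f, 4.0f, 0.0f, 1.0f}
   hilet q = reflect_point(p1, m); // {4.0f, 6.0f, 0.0f, 1.0f}: p1 mirrored through m lands on p2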
2712
2713 hi_warning_push();
2714 // C26494 Variable '...' is uninitialized. Always initialize an object (type.5).
2715 // Internal to _MM_TRANSPOSE4_PS
2716 hi_warning_ignore_msvc(26494);
2717 template<typename... Columns>
2718 [[nodiscard]] friend constexpr std::array<numeric_array, N> transpose(Columns const&...columns) noexcept
2719 {
2720 static_assert(sizeof...(Columns) == N, "Can only transpose square matrices");
2721
2722 if (not std::is_constant_evaluated()) {
2723#if defined(HI_HAS_SSE)
2724 if constexpr (is_f32x4 and sizeof...(Columns) == 4) {
2725 auto tmp = std::array<__m128, N>{columns.reg()...};
2726 _MM_TRANSPOSE4_PS(std::get<0>(tmp), std::get<1>(tmp), std::get<2>(tmp), std::get<3>(tmp));
2727 return {
2728 numeric_array{get<0>(tmp)},
2729 numeric_array{get<1>(tmp)},
2730 numeric_array{get<2>(tmp)},
2731 numeric_array{get<3>(tmp)}};
2732 }
2733#endif
2734 }
2735
2736 auto r = std::array<numeric_array, N>{};
2737 auto f = [&r, &columns... ]<std::size_t... Ints>(std::index_sequence<Ints...>)
2738 {
2739 auto tf = [&r](auto i, auto v) {
2740 for (std::size_t j = 0; j != N; ++j) {
2741 r[j][i] = v[j];
2742 }
2743 return 0;
2744 };
2745 static_cast<void>((tf(Ints, columns) + ...));
2746 };
2747 f(std::make_index_sequence<sizeof...(columns)>{});
2748 return r;
2749 }
2750 hi_warning_pop();
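
 A sketch of transpose() on four f32x4 columns (illustrative):

   hilet c0 = hi::f32x4{1.0f, 2.0f, 3.0f, 4.0f};
   hilet c1 = hi::f32x4{5.0f, 6.0f, 7.0f, 8.0f};
   hilet c2 = hi::f32x4{9.0f, 10.0f, 11.0f, 12.0f};
   hilet c3 = hi::f32x4{13.0f, 14.0f, 15.0f, 16.0f};
   hilet rows = transpose(c0, c1, c2, c3);
   // rows[0] == {1, 5, 9, 13}, rows[1] == {2, 6, 10, 14}, rows[2] == {3, 7, 11, 15}, rows[3] == {4, 8, 12, 16}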
2751
2752 [[nodiscard]] constexpr friend numeric_array composit(numeric_array const& under, numeric_array const& over) noexcept
2753 requires(N == 4 && std::is_floating_point_v<T>)
2754 {
2755 if (over.is_transparent()) {
2756 return under;
2757 }
2758 if (over.is_opaque()) {
2759 return over;
2760 }
2761
2762 hilet over_alpha = over.wwww();
2763 hilet under_alpha = under.wwww();
2764
2765 hilet over_color = over.xyz1();
2766 hilet under_color = under.xyz1();
2767
2768 hilet output_color = over_color * over_alpha + under_color * under_alpha * (T{1} - over_alpha);
2769
2770 return output_color / output_color.www1();
2771 }
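
 A worked example of the straight-alpha 'over' compositing above (illustrative; colours are RGBA with alpha in w):

   hilet under = hi::f32x4{0.0f, 0.0f, 1.0f, 1.0f}; // opaque blue
   hilet over = hi::f32x4{1.0f, 0.0f, 0.0f, 0.5f};  // half-transparent red
   hilet r = composit(under, over); // {0.5f, 0.0f, 0.5f, 1.0f}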
2772
2773 [[nodiscard]] constexpr friend numeric_array composit(numeric_array const& under, numeric_array const& over) noexcept
2774 requires(is_f16x4)
2775 {
2776 return numeric_array{composit(static_cast<numeric_array<float, 4>>(under), static_cast<numeric_array<float, 4>>(over))};
2777 }
2778
2779 [[nodiscard]] friend std::string to_string(numeric_array const& rhs) noexcept
2780 {
2781 auto r = std::string{};
2782
2783 r += '(';
2784 for (std::size_t i = 0; i != N; ++i) {
2785 if (i != 0) {
2786 r += "; ";
2787 }
2788 r += std::format("{}", rhs[i]);
2789 }
2790 r += ')';
2791 return r;
2792 }
2793
2794 friend std::ostream& operator<<(std::ostream& lhs, numeric_array const& rhs)
2795 {
2796 return lhs << to_string(rhs);
2797 }
2798
2803 template<std::size_t FromElement, std::size_t ToElement>
2804 [[nodiscard]] constexpr friend numeric_array insert(numeric_array const& lhs, numeric_array const& rhs)
2805 {
2806 auto r = numeric_array{};
2807
2808 if (!std::is_constant_evaluated()) {
2809#if defined(HI_HAS_SSE4_1)
2810 if constexpr (is_f32x4) {
2811 constexpr uint8_t insert_mask = static_cast<uint8_t>((FromElement << 6) | (ToElement << 4));
2812 return numeric_array{_mm_insert_ps(lhs.reg(), rhs.reg(), insert_mask)};
2813
2814 } else if constexpr (is_i32x4 or is_u32x4) {
2815 constexpr uint8_t insert_mask = static_cast<uint8_t>((FromElement << 6) | (ToElement << 4));
2816 return numeric_array{
2817 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg()), insert_mask))};
2818 }
2819#endif
2820#if defined(HI_HAS_SSE2)
2821 if constexpr (is_f64x2) {
2822 if constexpr (FromElement == 0 and ToElement == 0) {
2823 return numeric_array{_mm_shuffle_pd(rhs.reg(), lhs.reg(), 0b10)};
2824 } else if constexpr (FromElement == 1 and ToElement == 0) {
2825 return numeric_array{_mm_shuffle_pd(rhs.reg(), lhs.reg(), 0b11)};
2826 } else if constexpr (FromElement == 0 and ToElement == 1) {
2827 return numeric_array{_mm_shuffle_pd(lhs.reg(), rhs.reg(), 0b00)};
2828 } else {
2829 return numeric_array{_mm_shuffle_pd(lhs.reg(), rhs.reg(), 0b10)};
2830 }
2831
2832 } else if constexpr (is_i64x2 or is_u64x2) {
2833 hilet lhs_ = _mm_castsi128_pd(lhs.reg());
2834 hilet rhs_ = _mm_castsi128_pd(rhs.reg());
2835
2836 if constexpr (FromElement == 0 and ToElement == 0) {
2837 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(rhs_, lhs_, 0b10))};
2838 } else if constexpr (FromElement == 1 and ToElement == 0) {
2839 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(rhs_, lhs_, 0b11))};
2840 } else if constexpr (FromElement == 0 and ToElement == 1) {
2841 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(lhs_, rhs_, 0b00))};
2842 } else {
2843 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(lhs_, rhs_, 0b10))};
2844 }
2845 }
2846#endif
2847 }
2848
2849 for (std::size_t i = 0; i != N; ++i) {
2850 r[i] = (i == ToElement) ? rhs[FromElement] : lhs[i];
2851 }
2852
2853 return r;
2854 }
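
 A sketch of the element-copying insert() (illustrative): the FromElement of rhs replaces the ToElement of lhs, all other elements come from lhs.

   hilet a = hi::f32x4{1.0f, 2.0f, 3.0f, 4.0f};
   hilet b = hi::f32x4{5.0f, 6.0f, 7.0f, 8.0f};
   hilet r = insert<0, 3>(a, b); // {1.0f, 2.0f, 3.0f, 5.0f}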
2855
2863 template<ssize_t... Elements>
2864 [[nodiscard]] constexpr numeric_array swizzle() const
2865 {
2866 static_assert(sizeof...(Elements) <= N);
2867
2868 if (!std::is_constant_evaluated()) {
2869#if defined(HI_HAS_AVX)
2870 if constexpr (is_f64x2) {
2871 return numeric_array{_mm_swizzle_pd<Elements...>(reg())};
2872 } else if constexpr (is_f32x4) {
2873 return numeric_array{_mm_swizzle_ps<Elements...>(reg())};
2874 } else if constexpr (is_i64x2 or is_u64x2) {
2875 return numeric_array{_mm_swizzle_epi64<Elements...>(reg())};
2876 } else if constexpr (is_i32x4 or is_u32x4) {
2877 return numeric_array{_mm_swizzle_epi32<Elements...>(reg())};
2878 }
2879#endif
2880 }
2881
2882 auto r = numeric_array{};
2883 swizzle_detail<0, Elements...>(r);
2884 return r;
2885 }
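
 A sketch of swizzle() and of the named forms generated by the SWIZZLE macros below, where a trailing '0' or '1' inserts that constant (illustrative):

   hilet p = hi::f32x4{1.0f, 2.0f, 3.0f, 4.0f};
   hilet a = p.swizzle<3, 3, 3, 3>(); // {4.0f, 4.0f, 4.0f, 4.0f}, equivalent to p.wwww()
   hilet b = p.yxwz();                // {2.0f, 1.0f, 4.0f, 3.0f}
   hilet c = p.xyz1();                // {1.0f, 2.0f, 3.0f, 1.0f}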
2886
2887#define SWIZZLE(swizzle_name, D, ...) \
2888 [[nodiscard]] constexpr numeric_array swizzle_name() const noexcept requires(D == N) \
2889 { \
2890 return swizzle<__VA_ARGS__>(); \
2891 }
2892
2893#define SWIZZLE_4D_GEN1(name, ...) \
2894 SWIZZLE(name##0, 4, __VA_ARGS__, get_zero) \
2895 SWIZZLE(name##1, 4, __VA_ARGS__, get_one) \
2896 SWIZZLE(name##x, 4, __VA_ARGS__, 0) \
2897 SWIZZLE(name##y, 4, __VA_ARGS__, 1) \
2898 SWIZZLE(name##z, 4, __VA_ARGS__, 2) \
2899 SWIZZLE(name##w, 4, __VA_ARGS__, 3)
2900
2901#define SWIZZLE_4D_GEN2(name, ...) \
2902 SWIZZLE_4D_GEN1(name##0, __VA_ARGS__, get_zero) \
2903 SWIZZLE_4D_GEN1(name##1, __VA_ARGS__, get_one) \
2904 SWIZZLE_4D_GEN1(name##x, __VA_ARGS__, 0) \
2905 SWIZZLE_4D_GEN1(name##y, __VA_ARGS__, 1) \
2906 SWIZZLE_4D_GEN1(name##z, __VA_ARGS__, 2) \
2907 SWIZZLE_4D_GEN1(name##w, __VA_ARGS__, 3)
2908
2909#define SWIZZLE_4D_GEN3(name, ...) \
2910 SWIZZLE_4D_GEN2(name##0, __VA_ARGS__, get_zero) \
2911 SWIZZLE_4D_GEN2(name##1, __VA_ARGS__, get_one) \
2912 SWIZZLE_4D_GEN2(name##x, __VA_ARGS__, 0) \
2913 SWIZZLE_4D_GEN2(name##y, __VA_ARGS__, 1) \
2914 SWIZZLE_4D_GEN2(name##z, __VA_ARGS__, 2) \
2915 SWIZZLE_4D_GEN2(name##w, __VA_ARGS__, 3)
2916
2917 SWIZZLE_4D_GEN3(_0, get_zero)
2918 SWIZZLE_4D_GEN3(_1, get_one)
2919 SWIZZLE_4D_GEN3(x, 0)
2920 SWIZZLE_4D_GEN3(y, 1)
2921 SWIZZLE_4D_GEN3(z, 2)
2922 SWIZZLE_4D_GEN3(w, 3)
2923
2924#define SWIZZLE_3D_GEN1(name, ...) \
2925 SWIZZLE(name##0, 3, __VA_ARGS__, get_zero) \
2926 SWIZZLE(name##1, 3, __VA_ARGS__, get_one) \
2927 SWIZZLE(name##x, 3, __VA_ARGS__, 0) \
2928 SWIZZLE(name##y, 3, __VA_ARGS__, 1) \
2929 SWIZZLE(name##z, 3, __VA_ARGS__, 2)
2930
2931#define SWIZZLE_3D_GEN2(name, ...) \
2932 SWIZZLE_3D_GEN1(name##0, __VA_ARGS__, get_zero) \
2933 SWIZZLE_3D_GEN1(name##1, __VA_ARGS__, get_one) \
2934 SWIZZLE_3D_GEN1(name##x, __VA_ARGS__, 0) \
2935 SWIZZLE_3D_GEN1(name##y, __VA_ARGS__, 1) \
2936 SWIZZLE_3D_GEN1(name##z, __VA_ARGS__, 2)
2937
2938 SWIZZLE_3D_GEN2(_0, get_zero)
2939 SWIZZLE_3D_GEN2(_1, get_one)
2940 SWIZZLE_3D_GEN2(x, 0)
2941 SWIZZLE_3D_GEN2(y, 1)
2942 SWIZZLE_3D_GEN2(z, 2)
2943
2944#define SWIZZLE_2D_GEN1(name, ...) \
2945 SWIZZLE(name##0, 2, __VA_ARGS__, get_zero) \
2946 SWIZZLE(name##1, 2, __VA_ARGS__, get_one) \
2947 SWIZZLE(name##x, 2, __VA_ARGS__, 0) \
2948 SWIZZLE(name##y, 2, __VA_ARGS__, 1)
2949
2950 SWIZZLE_2D_GEN1(_0, get_zero)
2951 SWIZZLE_2D_GEN1(_1, get_one)
2952 SWIZZLE_2D_GEN1(x, 0)
2953 SWIZZLE_2D_GEN1(y, 1)
2954
2955#undef SWIZZLE
2956#undef SWIZZLE_4D_GEN1
2957#undef SWIZZLE_4D_GEN2
2958#undef SWIZZLE_4D_GEN3
2959#undef SWIZZLE_3D_GEN1
2960#undef SWIZZLE_3D_GEN2
2961#undef SWIZZLE_2D_GEN1
2962
2963 template<ssize_t I, ssize_t FirstElement, ssize_t... RestElements>
2964 constexpr void swizzle_detail(numeric_array& r) const noexcept
2965 {
2966 static_assert(I < narrow_cast<ssize_t>(N));
2967 static_assert(FirstElement >= -2 && FirstElement < narrow_cast<ssize_t>(N), "Index out of bounds");
2968
2969 get<I>(r) = get<FirstElement>(*this);
2970 if constexpr (sizeof...(RestElements) != 0) {
2971 swizzle_detail<I + 1, RestElements...>(r);
2972 }
2973 }
2974};
2975
2976using i8x1 = numeric_array<int8_t, 1>;
2977using i8x2 = numeric_array<int8_t, 2>;
2978using i8x4 = numeric_array<int8_t, 4>;
2979using i8x8 = numeric_array<int8_t, 8>;
2980using i8x16 = numeric_array<int8_t, 16>;
2981using i8x32 = numeric_array<int8_t, 32>;
2982using i8x64 = numeric_array<int8_t, 64>;
2983
2984using u8x1 = numeric_array<uint8_t, 1>;
2985using u8x2 = numeric_array<uint8_t, 2>;
2986using u8x4 = numeric_array<uint8_t, 4>;
2987using u8x8 = numeric_array<uint8_t, 8>;
2988using u8x16 = numeric_array<uint8_t, 16>;
2989using u8x32 = numeric_array<uint8_t, 32>;
2990using u8x64 = numeric_array<uint8_t, 64>;
2991
2992using i16x1 = numeric_array<int16_t, 1>;
2993using i16x2 = numeric_array<int16_t, 2>;
2994using i16x4 = numeric_array<int16_t, 4>;
2995using i16x8 = numeric_array<int16_t, 8>;
2996using i16x16 = numeric_array<int16_t, 16>;
2997using i16x32 = numeric_array<int16_t, 32>;
2998
2999using u16x1 = numeric_array<uint16_t, 1>;
3000using u16x2 = numeric_array<uint16_t, 2>;
3001using u16x4 = numeric_array<uint16_t, 4>;
3002using u16x8 = numeric_array<uint16_t, 8>;
3003using u16x16 = numeric_array<uint16_t, 16>;
3004using u16x32 = numeric_array<uint16_t, 32>;
3005
3006using f16x4 = numeric_array<float16, 4>;
3007
3008using i32x1 = numeric_array<int32_t, 1>;
3009using i32x2 = numeric_array<int32_t, 2>;
3010using i32x4 = numeric_array<int32_t, 4>;
3011using i32x8 = numeric_array<int32_t, 8>;
3012using i32x16 = numeric_array<int32_t, 16>;
3013
3014using u32x1 = numeric_array<uint32_t, 1>;
3015using u32x2 = numeric_array<uint32_t, 2>;
3016using u32x4 = numeric_array<uint32_t, 4>;
3017using u32x8 = numeric_array<uint32_t, 8>;
3018using u32x16 = numeric_array<uint32_t, 16>;
3019
3020using f32x1 = numeric_array<float, 1>;
3021using f32x2 = numeric_array<float, 2>;
3022using f32x4 = numeric_array<float, 4>;
3023using f32x8 = numeric_array<float, 8>;
3024using f32x16 = numeric_array<float, 16>;
3025
3026using i64x1 = numeric_array<int64_t, 1>;
3027using i64x2 = numeric_array<int64_t, 2>;
3028using i64x4 = numeric_array<int64_t, 4>;
3029using i64x8 = numeric_array<int64_t, 8>;
3030
3031using u64x1 = numeric_array<uint64_t, 1>;
3032using u64x2 = numeric_array<uint64_t, 2>;
3033using u64x4 = numeric_array<uint64_t, 4>;
3034using u64x8 = numeric_array<uint64_t, 8>;
3035
3036using f64x1 = numeric_array<double, 1>;
3037using f64x2 = numeric_array<double, 2>;
3038using f64x4 = numeric_array<double, 4>;
3039using f64x8 = numeric_array<double, 8>;
3040
3041} // namespace hi::inline v1
3042
3043template<class T, std::size_t N>
3044struct std::tuple_size<hi::numeric_array<T, N>> : std::integral_constant<std::size_t, N> {
3045};
3046
3047template<std::size_t I, class T, std::size_t N>
3048struct std::tuple_element<I, hi::numeric_array<T, N>> {
3049 using type = T;
3050};
3051
3052hi_warning_pop();