HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
numeric_array.hpp
1// Copyright Take Vos 2020-2021.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../architecture.hpp"
8#include "../concepts.hpp"
9#include "../cast.hpp"
10#include "../type_traits.hpp"
11#include "../float16.hpp"
12
13#if defined(HI_HAS_AVX)
14#include "swizzle_avx.hpp"
15#include <immintrin.h> // AVX, AVX2, FMA
16#endif
17#if defined(HI_HAS_SSE4_2)
18#include <nmmintrin.h> // SSE4.2
19#endif
20#if defined(HI_HAS_SSE4_1)
21#include "float16_sse4_1.hpp"
22#include <smmintrin.h> // SSE4.1
23#include <ammintrin.h> // SSE4A
24#endif
25#if defined(HI_HAS_SSSE3)
26#include <tmmintrin.h> // SSSE3
27#endif
28#if defined(HI_HAS_SSE3)
29#include <pmmintrin.h> // SSE3
30#endif
31#if defined(HI_HAS_SSE2)
32#include <emmintrin.h> // SSE2
33#endif
34#if defined(HI_HAS_SSE)
35#include <xmmintrin.h> // SSE
36#endif
37
38#include <cstdint>
39#include <ostream>
40#include <string>
41#include <array>
42#include <type_traits>
43#include <concepts>
44#include <bit>
45#include <climits>
46
47hi_warning_push();
48// C4702 unreachable code: Suppressed due to intrinsics and std::is_constant_evaluated()
49hi_msvc_suppress(4702);
50
51namespace hi::inline v1 {
52
53template<numeric_limited T, std::size_t N>
56 using value_type = typename container_type::value_type;
57 using size_type = typename container_type::size_type;
58 using difference_type = typename container_type::difference_type;
59 using reference = typename container_type::reference;
60 using const_reference = typename container_type::const_reference;
61 using pointer = typename container_type::pointer;
62 using const_pointer = typename container_type::const_pointer;
63 using iterator = typename container_type::iterator;
64 using const_iterator = typename container_type::const_iterator;
65
66 constexpr static bool is_i8x1 = std::is_same_v<T, int8_t> && N == 1;
67 constexpr static bool is_i8x2 = std::is_same_v<T, int8_t> && N == 2;
68 constexpr static bool is_i8x4 = std::is_same_v<T, int8_t> && N == 4;
69 constexpr static bool is_i8x8 = std::is_same_v<T, int8_t> && N == 8;
70 constexpr static bool is_i8x16 = std::is_same_v<T, int8_t> && N == 16;
71 constexpr static bool is_i8x32 = std::is_same_v<T, int8_t> && N == 32;
72 constexpr static bool is_i8x64 = std::is_same_v<T, int8_t> && N == 64;
73 constexpr static bool is_u8x1 = std::is_same_v<T, uint8_t> && N == 1;
74 constexpr static bool is_u8x2 = std::is_same_v<T, uint8_t> && N == 2;
75 constexpr static bool is_u8x4 = std::is_same_v<T, uint8_t> && N == 4;
76 constexpr static bool is_u8x8 = std::is_same_v<T, uint8_t> && N == 8;
77 constexpr static bool is_u8x16 = std::is_same_v<T, uint8_t> && N == 16;
78 constexpr static bool is_u8x32 = std::is_same_v<T, uint8_t> && N == 32;
79 constexpr static bool is_u8x64 = std::is_same_v<T, uint8_t> && N == 64;
80
81 constexpr static bool is_i16x1 = std::is_same_v<T, int16_t> && N == 1;
82 constexpr static bool is_i16x2 = std::is_same_v<T, int16_t> && N == 2;
83 constexpr static bool is_i16x4 = std::is_same_v<T, int16_t> && N == 4;
84 constexpr static bool is_i16x8 = std::is_same_v<T, int16_t> && N == 8;
85 constexpr static bool is_i16x16 = std::is_same_v<T, int16_t> && N == 16;
86 constexpr static bool is_i16x32 = std::is_same_v<T, int16_t> && N == 32;
87 constexpr static bool is_u16x1 = std::is_same_v<T, uint16_t> && N == 1;
88 constexpr static bool is_u16x2 = std::is_same_v<T, uint16_t> && N == 2;
89 constexpr static bool is_u16x4 = std::is_same_v<T, uint16_t> && N == 4;
90 constexpr static bool is_u16x8 = std::is_same_v<T, uint16_t> && N == 8;
91 constexpr static bool is_u16x16 = std::is_same_v<T, uint16_t> && N == 16;
92 constexpr static bool is_u16x32 = std::is_same_v<T, uint16_t> && N == 32;
93 constexpr static bool is_f16x4 = std::is_same_v<T, float16> && N == 4;
94
95 constexpr static bool is_i32x1 = std::is_same_v<T, int32_t> && N == 1;
96 constexpr static bool is_i32x2 = std::is_same_v<T, int32_t> && N == 2;
97 constexpr static bool is_i32x4 = std::is_same_v<T, int32_t> && N == 4;
98 constexpr static bool is_i32x8 = std::is_same_v<T, int32_t> && N == 8;
99 constexpr static bool is_i32x16 = std::is_same_v<T, int32_t> && N == 16;
100 constexpr static bool is_u32x1 = std::is_same_v<T, uint32_t> && N == 1;
101 constexpr static bool is_u32x2 = std::is_same_v<T, uint32_t> && N == 2;
102 constexpr static bool is_u32x4 = std::is_same_v<T, uint32_t> && N == 4;
103 constexpr static bool is_u32x8 = std::is_same_v<T, uint32_t> && N == 8;
104 constexpr static bool is_u32x16 = std::is_same_v<T, uint32_t> && N == 16;
105 constexpr static bool is_f32x1 = std::is_same_v<T, float> && N == 1;
106 constexpr static bool is_f32x2 = std::is_same_v<T, float> && N == 2;
107 constexpr static bool is_f32x4 = std::is_same_v<T, float> && N == 4;
108 constexpr static bool is_f32x8 = std::is_same_v<T, float> && N == 8;
109 constexpr static bool is_f32x16 = std::is_same_v<T, float> && N == 16;
110
111 constexpr static bool is_i64x1 = std::is_same_v<T, int64_t> && N == 1;
112 constexpr static bool is_i64x2 = std::is_same_v<T, int64_t> && N == 2;
113 constexpr static bool is_i64x4 = std::is_same_v<T, int64_t> && N == 4;
114 constexpr static bool is_i64x8 = std::is_same_v<T, int64_t> && N == 8;
115 constexpr static bool is_u64x1 = std::is_same_v<T, uint64_t> && N == 1;
116 constexpr static bool is_u64x2 = std::is_same_v<T, uint64_t> && N == 2;
117 constexpr static bool is_u64x4 = std::is_same_v<T, uint64_t> && N == 4;
118 constexpr static bool is_u64x8 = std::is_same_v<T, uint64_t> && N == 8;
119 constexpr static bool is_f64x1 = std::is_same_v<T, double> && N == 1;
120 constexpr static bool is_f64x2 = std::is_same_v<T, double> && N == 2;
121 constexpr static bool is_f64x4 = std::is_same_v<T, double> && N == 4;
122 constexpr static bool is_f64x8 = std::is_same_v<T, double> && N == 8;
123
125
126 constexpr numeric_array() noexcept : v()
127 {
128 if (not std::is_constant_evaluated()) {
129#if defined(HI_HAS_AVX)
130 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x16 or is_u16x16 or is_i8x32 or is_u8x32) {
131 _mm256_storeu_si256(reinterpret_cast<__m256i *>(v.data()), _mm256_setzero_si256());
132 return;
133 } else if constexpr (is_f64x4) {
134 _mm256_storeu_pd(reinterpret_cast<__m256d *>(v.data()), _mm256_setzero_pd());
135 return;
136 } else if constexpr (is_f32x8) {
137 _mm256_storeu_ps(v.data(), _mm256_setzero_ps());
138 return;
139 }
140#endif
141#if defined(HI_HAS_SSE2)
142 if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_u8x16) {
143 _mm_storeu_si128(reinterpret_cast<__m128i *>(v.data()), _mm_setzero_si128());
144 return;
145 } else if constexpr (is_f64x2) {
146 _mm_storeu_pd(reinterpret_cast<__m128d *>(v.data()), _mm_setzero_pd());
147 return;
148 }
149#endif
150#if defined(HI_HAS_SSE)
151 if constexpr (is_f32x4) {
152 _mm_storeu_ps(v.data(), _mm_setzero_ps());
153 return;
154 }
155#endif
156 }
157
158 for (auto i = 0_uz; i != N; ++i) {
159 v[i] = T{};
160 }
161 }
162
163 constexpr numeric_array(numeric_array const &rhs) noexcept = default;
164 constexpr numeric_array(numeric_array &&rhs) noexcept = default;
165 constexpr numeric_array &operator=(numeric_array const &rhs) noexcept = default;
166 constexpr numeric_array &operator=(numeric_array &&rhs) noexcept = default;
167
168 template<numeric_limited U, std::size_t M>
169 [[nodiscard]] constexpr explicit numeric_array(numeric_array<U, M> const &other) noexcept : v()
170 {
171 if (!std::is_constant_evaluated()) {
172#if defined(HI_HAS_AVX)
173 if constexpr (is_f64x4 and other.is_f32x4) {
174 v = numeric_array{_mm256_cvteps_pd(other.reg())};
175 return;
176 } else if constexpr (is_f64x4 and other.is_i32x4) {
177 v = numeric_array{_mm256_cvtepi32_pd(other.reg())};
178 return;
179 } else if constexpr (is_f32x4 and other.is_f64x4) {
180 v = numeric_array{_mm256_cvtpd_ps(other.reg())};
181 return;
182 } else if constexpr (is_i32x4 and other.is_f64x4) {
183 v = numeric_array{_mm256_cvtpd_epi32(other.reg())};
184 return;
185 } else if constexpr (is_i32x8 and other.is_f32x8) {
186 v = numeric_array{_mm256_cvtps_epi32(other.reg())};
187 return;
188 } else if constexpr (is_f32x8 and other.is_i32x8) {
189 v = numeric_array{_mm256_cvtepi32_ps(other.reg())};
190 return;
191 }
192#endif
193#if defined(HI_HAS_SSE4_1)
194 if constexpr (is_u8x4 and other.is_f32x4) {
195 hilet i32_4 = _mm_cvtps_epi32(other.reg());
196 hilet i16_8 = _mm_packs_epi32(i32_4, _mm_setzero_si128());
197 hilet u8_16 = _mm_packus_epi16(i16_8, _mm_setzero_si128());
198 v = numeric_array{u8_16};
199 return;
200 } else if constexpr (is_i64x4 and other.is_i32x4) {
201 v = numeric_array{_mm_cvtepi32_epi64(other.reg())};
202 return;
203 } else if constexpr (is_i64x4 and other.is_i16x8) {
204 v = numeric_array{_mm_cvtepi16_epi64(other.reg())};
205 return;
206 } else if constexpr (is_i32x4 and other.is_i16x8) {
207 v = numeric_array{_mm_cvtepi16_epi32(other.reg())};
208 return;
209 } else if constexpr (is_i64x2 and other.is_i8x16) {
210 v = numeric_array{_mm_cvtepi8_epi64(other.reg())};
211 return;
212 } else if constexpr (is_i32x4 and other.is_i8x16) {
213 v = numeric_array{_mm_cvtepi8_epi32(other.reg())};
214 return;
215 } else if constexpr (is_i16x8 and other.is_i8x16) {
216 v = numeric_array{_mm_cvtepi8_epi16(other.reg())};
217 return;
218 } else if constexpr (is_f16x4 and other.is_f32x4) {
219 v = numeric_array{_mm_cvtps_ph_sse4_1(other.reg())};
220 return;
221 } else if constexpr (is_f32x4 and other.is_f16x4) {
222 v = numeric_array{_mm_cvtph_ps_sse2(other.reg())};
223 return;
224 }
225
226#endif
227#if defined(HI_HAS_SSE2)
228 if constexpr (is_f64x2 and other.is_i32x4) {
229 v = numeric_array{_mm_cvtepi32_pd(other.reg())};
230 return;
231 } else if constexpr (is_f32x4 and other.is_i32x4) {
232 v = numeric_array{_mm_cvtepi32_ps(other.reg())};
233 return;
234 } else if constexpr (is_i32x4 and other.is_f32x4) {
235 v = numeric_array{_mm_cvtps_epi32(other.reg())};
236 return;
237 }
238#endif
239 }
240
241 for (std::size_t i = 0; i != N; ++i) {
242 if (i < M) {
243 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
244 // SSE conversion round floats before converting to integer.
245 v[i] = static_cast<value_type>(std::round(other[i]));
246 } else {
247 v[i] = static_cast<value_type>(other[i]);
248 }
249 } else {
250 v[i] = T{};
251 }
252 }
253 }
254
255 template<numeric_limited U, std::size_t M>
256 [[nodiscard]] constexpr explicit numeric_array(numeric_array<U, M> const &other1, numeric_array<U, M> const &other2) noexcept
257 :
258 v()
259 {
260 if (!std::is_constant_evaluated()) {
261#if defined(HI_HAS_AVX)
262 if constexpr (is_f64x4 and other1.is_f64x2 and other2.is_f64x2) {
263 v = numeric_array{_mm256_set_m128d(other2.reg(), other1.reg())};
264 } else if constexpr (is_f32x8 and other1.is_f32x4 and other2.is_f32x4) {
265 v = numeric_array{_mm256_set_m128(other2.reg(), other1.reg())};
266 } else if constexpr (
267 std::is_integral_v<T> and std::is_integral_v<U> and (sizeof(T) * N == 32) and (sizeof(U) * M == 16)) {
268 v = numeric_array{_mm256_set_m128i(other2.reg(), other1.reg())};
269 }
270#endif
271#if defined(HI_HAS_SSE4_1)
272 if constexpr (is_u16x8 and other1.is_u32x4 and other2.is_u32x4) {
273 v = numeric_array{_mm_packus_epu32(other2.reg(), other1.reg())};
274 }
275#endif
276#if defined(HI_HAS_SSE2)
277 if constexpr (is_i16x8 and other1.is_i32x4 and other2.is_i32x4) {
278 v = numeric_array{_mm_packs_epi32(other2.reg(), other1.reg())};
279 } else if constexpr (is_i8x16 and other1.is_i16x8 and other2.is_i16x8) {
280 v = numeric_array{_mm_packs_epi16(other2.reg(), other1.reg())};
281 } else if constexpr (is_u8x16 and other1.is_u16x8 and other2.is_u16x8) {
282 v = numeric_array{_mm_packus_epu16(other2.reg(), other1.reg())};
283 }
284#endif
285 }
286
287 for (std::size_t i = 0; i != N; ++i) {
288 if (i < M) {
289 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
290 // SSE conversion round floats before converting to integer.
291 v[i] = static_cast<value_type>(std::round(other1[i]));
292 } else {
293 v[i] = static_cast<value_type>(other1[i]);
294 }
295 } else if (i < M * 2) {
296 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
297 // SSE conversion round floats before converting to integer.
298 v[i] = static_cast<value_type>(std::round(other2[i - M]));
299 } else {
300 v[i] = static_cast<value_type>(other2[i - M]);
301 }
302 } else {
303 v[i] = U{};
304 }
305 }
306 }
307
308 [[nodiscard]] constexpr explicit numeric_array(T const &x) noexcept : v()
309 {
310 if (not std::is_constant_evaluated()) {
311#if defined(HI_HAS_SSE)
312 if constexpr (is_f32x4) {
313 *this = numeric_array{_mm_set_ss(x)};
314 return;
315 }
316#endif
317 }
318 get<0>(v) = x;
319 }
320
321 [[nodiscard]] constexpr explicit numeric_array(T const &x, T const &y) noexcept requires(N >= 2) : v()
322 {
323 if (not std::is_constant_evaluated()) {
324#if defined(HI_HAS_SSE2)
325 if constexpr (is_i32x4) {
326 *this = numeric_array{_mm_set_epi32(0, 0, y, x)};
327 return;
328 }
329#endif
330 }
331 get<0>(v) = x;
332 get<1>(v) = y;
333 }
334
335 [[nodiscard]] constexpr explicit numeric_array(T const &x, T const &y, T const &z) noexcept requires(N >= 3) : v()
336 {
337 if (not std::is_constant_evaluated()) {
338#if defined(HI_HAS_SSE2)
339 if constexpr (is_i32x4) {
340 *this = numeric_array{_mm_set_epi32(0, z, y, x)};
341 return;
342 }
343#endif
344 }
345 get<0>(v) = x;
346 get<1>(v) = y;
347 get<2>(v) = z;
348 }
349
350 [[nodiscard]] constexpr explicit numeric_array(T const &x, T const &y, T const &z, T const &w) noexcept requires(N >= 4) : v()
351 {
352 if (not std::is_constant_evaluated()) {
353#if defined(HI_HAS_SSE2)
354 if constexpr (is_i32x4) {
355 *this = numeric_array{_mm_set_epi32(w, z, y, x)};
356 return;
357 }
358#endif
359 }
360 get<0>(v) = x;
361 get<1>(v) = y;
362 get<2>(v) = z;
363 get<3>(v) = w;
364 }
365
366 [[nodiscard]] static constexpr numeric_array broadcast(T rhs) noexcept
367 {
368 if (not std::is_constant_evaluated()) {
369#if defined(HI_HAS_AVX)
370 if constexpr (is_f64x4) {
371 return numeric_array{_mm256_set1_pd(rhs)};
372 } else if constexpr (is_f32x8) {
373 return numeric_array{_mm256_set1_ps(rhs)};
374 } else if constexpr (is_i64x4) {
375 return numeric_array{_mm256_set1_epi64x(rhs)};
376 } else if constexpr (is_i32x8) {
377 return numeric_array{_mm256_set1_epi32(rhs)};
378 } else if constexpr (is_i16x16) {
379 return numeric_array{_mm256_set1_epi16(rhs)};
380 } else if constexpr (is_i8x32) {
381 return numeric_array{_mm256_set1_epi8(rhs)};
382 }
383#endif
384#if defined(HI_HAS_SSE2)
385 if constexpr (is_f64x2) {
386 return numeric_array{_mm_set1_pd(rhs)};
387 } else if constexpr (is_i64x2) {
388 return numeric_array{_mm_set1_epi64x(rhs)};
389 } else if constexpr (is_i32x4) {
390 return numeric_array{_mm_set1_epi32(rhs)};
391 } else if constexpr (is_i16x8) {
392 return numeric_array{_mm_set1_epi16(rhs)};
393 } else if constexpr (is_i8x16) {
394 return numeric_array{_mm_set1_epi8(rhs)};
395 }
396#endif
397#if defined(HI_HAS_SSE)
398 if constexpr (is_f32x4) {
399 return numeric_array{_mm_set1_ps(rhs)};
400 }
401#endif
402 }
403 auto r = numeric_array{};
404 for (std::size_t i = 0; i != N; ++i) {
405 r[i] = rhs;
406 }
407 return r;
408 }
409
410 [[nodiscard]] static constexpr numeric_array epsilon() noexcept
411 {
412 if constexpr (std::is_floating_point_v<T>) {
413 return broadcast(std::numeric_limits<T>::min());
414 } else {
415 return broadcast(T{0});
416 }
417 }
418
419 [[nodiscard]] numeric_array(std::array<T, N> const &rhs) noexcept : v(rhs) {}
420
421 numeric_array &operator=(std::array<T, N> const &rhs) noexcept
422 {
423 v = rhs;
424 return *this;
425 }
426
427 [[nodiscard]] operator std::array<T, N>() const noexcept
428 {
429 return v;
430 }
431
#if defined(HI_HAS_SSE2)
/** Load a 16-byte integer array into a 128-bit integer register (unaligned load).
 */
[[nodiscard]] __m128i reg() const noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
{
    auto const *source = reinterpret_cast<__m128i const *>(v.data());
    return _mm_loadu_si128(source);
}

/** Load four half-floats into the low four 16-bit lanes of a 128-bit register.
 *
 * The upper four lanes are zero.
 */
[[nodiscard]] __m128i reg() const noexcept requires(is_f16x4)
{
    // _mm_set_epi16() lists the lanes from high to low.
    return _mm_set_epi16(0, 0, 0, 0, std::get<3>(v).get(), std::get<2>(v).get(), std::get<1>(v).get(), std::get<0>(v).get());
}
#endif
443
// Only SSE is needed here; the f32x4 code paths that call reg() are guarded
// by HI_HAS_SSE, so this must be available under the same condition.
#if defined(HI_HAS_SSE)
/** Load the four floats into a 128-bit register (unaligned load).
 */
[[nodiscard]] __m128 reg() const noexcept requires(is_f32x4)
{
    return _mm_loadu_ps(v.data());
}
#endif
450
#if defined(HI_HAS_SSE2)
/** Load the two doubles into a 128-bit register (unaligned load).
 */
[[nodiscard]] __m128d reg() const noexcept requires(is_f64x2)
{
    double const *source = v.data();
    return _mm_loadu_pd(source);
}
#endif
457
#if defined(HI_HAS_SSE2)
/** Construct a 16-byte integer array from a 128-bit integer register (unaligned store).
 */
[[nodiscard]] explicit numeric_array(__m128i const &rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
{
    auto *destination = reinterpret_cast<__m128i *>(v.data());
    _mm_storeu_si128(destination, rhs);
}
#endif
464
#if defined(HI_HAS_SSE4_1)
/** Construct four half-floats from the low 64 bits of a 128-bit register.
 */
[[nodiscard]] explicit numeric_array(__m128i const &rhs) noexcept requires(is_f16x4)
    : v(std::bit_cast<decltype(v)>(_mm_extract_epi64(rhs, 0)))
{
}
#endif
471
#if defined(HI_HAS_SSE4_1)
/** Construct four unsigned bytes from the low 32 bits of a 128-bit register.
 */
[[nodiscard]] explicit numeric_array(__m128i const &rhs) noexcept requires(is_u8x4)
    : v(std::bit_cast<decltype(v)>(_mm_extract_epi32(rhs, 0)))
{
}
#endif
478
// Only SSE is needed here; the f32x4 code paths that construct from a __m128
// are guarded by HI_HAS_SSE, so this must be available under the same condition.
#if defined(HI_HAS_SSE)
/** Construct four floats from a 128-bit register (unaligned store).
 */
[[nodiscard]] explicit numeric_array(__m128 const &rhs) noexcept requires(is_f32x4)
{
    _mm_storeu_ps(v.data(), rhs);
}
#endif
485
#if defined(HI_HAS_SSE2)
/** Construct two doubles from a 128-bit register (unaligned store).
 */
[[nodiscard]] explicit numeric_array(__m128d const &rhs) noexcept requires(is_f64x2)
{
    double *destination = v.data();
    _mm_storeu_pd(destination, rhs);
}
#endif
492
#if defined(HI_HAS_SSE2)
/** Overwrite a 16-byte integer array with a 128-bit integer register (unaligned store).
 */
numeric_array &operator=(__m128i const &rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
{
    auto *destination = reinterpret_cast<__m128i *>(v.data());
    _mm_storeu_si128(destination, rhs);
    return *this;
}
#endif
500
// Only SSE is needed here; kept under the same guard as the other f32x4
// register conversions so that SSE-only builds see a consistent interface.
#if defined(HI_HAS_SSE)
/** Overwrite the four floats with a 128-bit register (unaligned store).
 */
numeric_array &operator=(__m128 const &rhs) noexcept requires(is_f32x4)
{
    _mm_storeu_ps(v.data(), rhs);
    return *this;
}
#endif
508
#if defined(HI_HAS_SSE2)
/** Overwrite the two doubles with a 128-bit register (unaligned store).
 */
numeric_array &operator=(__m128d const &rhs) noexcept requires(is_f64x2)
{
    double *destination = v.data();
    _mm_storeu_pd(destination, rhs);
    return *this;
}
#endif
516
#if defined(HI_HAS_AVX)
/** Load a 32-byte integer array into a 256-bit integer register (unaligned load).
 */
[[nodiscard]] __m256i reg() const noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
{
    auto const *source = reinterpret_cast<__m256i const *>(v.data());
    return _mm256_loadu_si256(source);
}
#endif
523
#if defined(HI_HAS_AVX)
/** Load the eight floats into a 256-bit register (unaligned load).
 */
[[nodiscard]] __m256 reg() const noexcept requires(is_f32x8)
{
    float const *source = v.data();
    return _mm256_loadu_ps(source);
}
#endif
530
#if defined(HI_HAS_AVX)
/** Load the four doubles into a 256-bit register (unaligned load).
 */
[[nodiscard]] __m256d reg() const noexcept requires(is_f64x4)
{
    double const *source = v.data();
    return _mm256_loadu_pd(source);
}
#endif
537
#if defined(HI_HAS_AVX)
/** Construct a 32-byte integer array from a 256-bit integer register (unaligned store).
 */
[[nodiscard]] explicit numeric_array(__m256i const &rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
{
    auto *destination = reinterpret_cast<__m256i *>(v.data());
    _mm256_storeu_si256(destination, rhs);
}
#endif
544
#if defined(HI_HAS_AVX)
/** Construct eight floats from a 256-bit register (unaligned store).
 */
[[nodiscard]] explicit numeric_array(__m256 const &rhs) noexcept requires(is_f32x8)
{
    float *destination = v.data();
    _mm256_storeu_ps(destination, rhs);
}
#endif
551
#if defined(HI_HAS_AVX)
/** Construct four doubles from a 256-bit register (unaligned store).
 */
[[nodiscard]] explicit numeric_array(__m256d const &rhs) noexcept requires(is_f64x4)
{
    double *destination = v.data();
    _mm256_storeu_pd(destination, rhs);
}
#endif
558
#if defined(HI_HAS_AVX)
/** Overwrite a 32-byte integer array with a 256-bit integer register (unaligned store).
 */
numeric_array &operator=(__m256i const &rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
{
    auto *destination = reinterpret_cast<__m256i *>(v.data());
    _mm256_storeu_si256(destination, rhs);
    return *this;
}
#endif
566
#if defined(HI_HAS_AVX)
/** Overwrite the eight floats with a 256-bit register (unaligned store).
 */
numeric_array &operator=(__m256 const &rhs) noexcept requires(is_f32x8)
{
    float *destination = v.data();
    _mm256_storeu_ps(destination, rhs);
    return *this;
}
#endif
574
#if defined(HI_HAS_AVX)
/** Overwrite the four doubles with a 256-bit register (unaligned store).
 */
numeric_array &operator=(__m256d const &rhs) noexcept requires(is_f64x4)
{
    double *destination = v.data();
    _mm256_storeu_pd(destination, rhs);
    return *this;
}
#endif
582
583 template<typename Other>
584 [[nodiscard]] constexpr friend Other bit_cast(numeric_array const &rhs) noexcept
585 requires(sizeof(Other) == sizeof(container_type))
586 {
587 if (not std::is_constant_evaluated()) {
588#if defined(HI_HAS_SSE2)
589 if constexpr (Other::is_f32x4 and std::is_integral_v<T>) {
590 return Other{_mm_castsi128_ps(rhs.reg())};
591 } else if constexpr (Other::is_f32x4 and is_f64x2) {
592 return Other{_mm_castpd_ps(rhs.reg())};
593 } else if constexpr (Other::is_f64x2 and std::is_integral_v<T>) {
594 return Other{_mm_castsi128_pd(rhs.reg())};
595 } else if constexpr (Other::is_f64x2 and is_f32x4) {
596 return Other{_mm_castps_pd(rhs.reg())};
597 } else if constexpr (std::is_integral_v<Other::value_type> and is_f32x4) {
598 return Other{_mm_castps_si128(rhs.reg())};
599 } else if constexpr (std::is_integral_v<Other::value_type> and is_f64x2) {
600 return Other{_mm_castpd_si128(rhs.reg())};
601 } else if constexpr (std::is_integral_v<Other::value_type> and std::is_integral_v<T>) {
602 return Other{rhs.reg()};
603 }
604#endif
605 }
606 return std::bit_cast<Other>(rhs);
607 }
608
612 {
613 if (not std::is_constant_evaluated()) {
614#if defined(HI_HAS_SSE2)
615 if constexpr (is_f64x2) {
616 return numeric_array{_mm_unpacklo_pd(a.reg(), b.reg())};
617 } else if constexpr (is_i64x2 or is_u64x2) {
618 return numeric_array{_mm_unpacklo_epi64(a.reg(), b.reg())};
619 } else if constexpr (is_i32x4 or is_u32x4) {
620 return numeric_array{_mm_unpacklo_epi32(a.reg(), b.reg())};
621 } else if constexpr (is_i16x8 or is_u16x8) {
622 return numeric_array{_mm_unpacklo_epi16(a.reg(), b.reg())};
623 } else if constexpr (is_i8x16 or is_u8x16) {
624 return numeric_array{_mm_unpacklo_epi8(a.reg(), b.reg())};
625 }
626#endif
627#if defined(HI_HAS_SSE)
628 if constexpr (is_f32x4) {
629 return numeric_array{_mm_unpacklo_ps(a.reg(), b.reg())};
630 }
631#endif
632 }
633
634 auto r = numeric_array{};
635 for (std::size_t i = 0; i != N; ++i) {
636 r[i] = (i % 2 == 0) ? a[i / 2] : b[i / 2];
637 }
638 return r;
639 }
640
645 template<std::size_t S>
646 [[nodiscard]] static constexpr numeric_array load(std::byte const *ptr) noexcept
647 {
648 auto r = numeric_array{};
649 std::memcpy(&r, ptr, S);
650 return r;
651 }
652
657 [[nodiscard]] static constexpr numeric_array load(std::byte const *ptr) noexcept
658 {
659 auto r = numeric_array{};
660 std::memcpy(&r, ptr, sizeof(r));
661 return r;
662 }
663
668 [[nodiscard]] static constexpr numeric_array load(T const *ptr) noexcept
669 {
670 auto r = numeric_array{};
671 std::memcpy(&r, ptr, sizeof(r));
672 return r;
673 }
674
675 template<std::size_t S>
676 constexpr void store(std::byte *ptr) const noexcept
677 {
678 std::memcpy(ptr, this, S);
679 }
680
684 constexpr void store(std::byte *ptr) const noexcept
685 {
686 store<sizeof(*this)>(ptr);
687 }
688
692 constexpr explicit operator bool() const noexcept
693 {
694 if constexpr (std::is_floating_point_v<T>) {
695 hilet ep = epsilon();
696 // check if any of the elements is outside of epsilon range,
697 return static_cast<bool>(gt(-ep, *this) | gt(*this, ep));
698 } else {
699 return static_cast<bool>(ne(*this, T{0}));
700 }
701 }
702
703 [[nodiscard]] constexpr T const &operator[](std::size_t i) const noexcept
704 {
705 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
706 hi_axiom(i < N);
707 return v[i];
708 }
709
710 [[nodiscard]] constexpr T &operator[](std::size_t i) noexcept
711 {
712 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
713 hi_axiom(i < N);
714 return v[i];
715 }
716
717 [[nodiscard]] constexpr reference front() noexcept
718 {
719 return v.front();
720 }
721
722 [[nodiscard]] constexpr const_reference front() const noexcept
723 {
724 return v.front();
725 }
726
727 [[nodiscard]] constexpr reference back() noexcept
728 {
729 return v.back();
730 }
731
732 [[nodiscard]] constexpr const_reference back() const noexcept
733 {
734 return v.back();
735 }
736
737 [[nodiscard]] constexpr pointer data() noexcept
738 {
739 return v.data();
740 }
741
742 [[nodiscard]] constexpr const_pointer data() const noexcept
743 {
744 return v.data();
745 }
746
747 [[nodiscard]] constexpr iterator begin() noexcept
748 {
749 return v.begin();
750 }
751
752 [[nodiscard]] constexpr const_iterator begin() const noexcept
753 {
754 return v.begin();
755 }
756
757 [[nodiscard]] constexpr const_iterator cbegin() const noexcept
758 {
759 return v.cbegin();
760 }
761
762 [[nodiscard]] constexpr iterator end() noexcept
763 {
764 return v.end();
765 }
766
767 [[nodiscard]] constexpr const_iterator end() const noexcept
768 {
769 return v.end();
770 }
771
772 [[nodiscard]] constexpr const_iterator cend() const noexcept
773 {
774 return v.cend();
775 }
776
777 [[nodiscard]] constexpr bool empty() const noexcept
778 {
779 return v.empty();
780 }
781
782 [[nodiscard]] constexpr size_type size() const noexcept
783 {
784 return v.size();
785 }
786
787 [[nodiscard]] constexpr size_type max_size() const noexcept
788 {
789 return v.max_size();
790 }
791
792 constexpr bool is_point() const noexcept
793 {
794 return v.back() != T{};
795 }
796
797 constexpr bool is_vector() const noexcept
798 {
799 return v.back() == T{};
800 }
801
802 constexpr bool is_opaque() const noexcept
803 {
804 return a() == T{1};
805 }
806
807 constexpr bool is_transparent() const noexcept
808 {
809 return a() == T{0};
810 }
811
812 [[nodiscard]] constexpr T const &x() const noexcept requires(N >= 1)
813 {
814 return std::get<0>(v);
815 }
816
817 [[nodiscard]] constexpr T const &y() const noexcept requires(N >= 2)
818 {
819 return std::get<1>(v);
820 }
821
822 [[nodiscard]] constexpr T const &z() const noexcept requires(N >= 3)
823 {
824 return std::get<2>(v);
825 }
826
827 [[nodiscard]] constexpr T const &w() const noexcept requires(N >= 4)
828 {
829 return std::get<3>(v);
830 }
831
832 [[nodiscard]] constexpr T &x() noexcept requires(N >= 1)
833 {
834 return std::get<0>(v);
835 }
836
837 [[nodiscard]] constexpr T &y() noexcept requires(N >= 2)
838 {
839 return std::get<1>(v);
840 }
841
842 [[nodiscard]] constexpr T &z() noexcept requires(N >= 3)
843 {
844 return std::get<2>(v);
845 }
846
847 [[nodiscard]] constexpr T &w() noexcept requires(N >= 4)
848 {
849 return std::get<3>(v);
850 }
851
852 [[nodiscard]] constexpr T const &r() const noexcept requires(N >= 1)
853 {
854 return std::get<0>(v);
855 }
856
857 [[nodiscard]] constexpr T const &g() const noexcept requires(N >= 2)
858 {
859 return std::get<1>(v);
860 }
861
862 [[nodiscard]] constexpr T const &b() const noexcept requires(N >= 3)
863 {
864 return std::get<2>(v);
865 }
866
867 [[nodiscard]] constexpr T const &a() const noexcept requires(N >= 4)
868 {
869 return std::get<3>(v);
870 }
871
872 [[nodiscard]] constexpr T &r() noexcept requires(N >= 1)
873 {
874 return std::get<0>(v);
875 }
876
877 [[nodiscard]] constexpr T &g() noexcept requires(N >= 2)
878 {
879 return std::get<1>(v);
880 }
881
882 [[nodiscard]] constexpr T &b() noexcept requires(N >= 3)
883 {
884 return std::get<2>(v);
885 }
886
887 [[nodiscard]] constexpr T &a() noexcept requires(N >= 4)
888 {
889 return std::get<3>(v);
890 }
891
892 [[nodiscard]] constexpr T const &width() const noexcept requires(N >= 1)
893 {
894 return std::get<0>(v);
895 }
896
897 [[nodiscard]] constexpr T const &height() const noexcept requires(N >= 2)
898 {
899 return std::get<1>(v);
900 }
901
902 [[nodiscard]] constexpr T const &depth() const noexcept requires(N >= 3)
903 {
904 return std::get<2>(v);
905 }
906
907 [[nodiscard]] constexpr T &width() noexcept requires(N >= 1)
908 {
909 return std::get<0>(v);
910 }
911
912 [[nodiscard]] constexpr T &height() noexcept requires(N >= 2)
913 {
914 return std::get<1>(v);
915 }
916
917 [[nodiscard]] constexpr T &depth() noexcept requires(N >= 3)
918 {
919 return std::get<2>(v);
920 }
921
922 constexpr numeric_array &operator<<=(unsigned int rhs) noexcept
923 {
924 return *this = *this << rhs;
925 }
926
927 constexpr numeric_array &operator>>=(unsigned int rhs) noexcept
928 {
929 return *this = *this >> rhs;
930 }
931
932 constexpr numeric_array &operator|=(numeric_array const &rhs) noexcept
933 {
934 return *this = *this | rhs;
935 }
936
937 constexpr numeric_array &operator|=(T const &rhs) noexcept
938 {
939 return *this = *this | rhs;
940 }
941
942 constexpr numeric_array &operator&=(numeric_array const &rhs) noexcept
943 {
944 return *this = *this & rhs;
945 }
946
947 constexpr numeric_array &operator&=(T const &rhs) noexcept
948 {
949 return *this = *this & rhs;
950 }
951
952 constexpr numeric_array &operator^=(numeric_array const &rhs) noexcept
953 {
954 return *this = *this ^ rhs;
955 }
956
957 constexpr numeric_array &operator^=(T const &rhs) noexcept
958 {
959 return *this = *this ^ rhs;
960 }
961
962 constexpr numeric_array &operator+=(numeric_array const &rhs) noexcept
963 {
964 return *this = *this + rhs;
965 }
966
967 constexpr numeric_array &operator+=(T const &rhs) noexcept
968 {
969 return *this = *this + rhs;
970 }
971
972 constexpr numeric_array &operator-=(numeric_array const &rhs) noexcept
973 {
974 return *this = *this - rhs;
975 }
976
977 constexpr numeric_array &operator-=(T const &rhs) noexcept
978 {
979 return *this = *this - rhs;
980 }
981
982 constexpr numeric_array &operator*=(numeric_array const &rhs) noexcept
983 {
984 return *this = *this * rhs;
985 }
986
987 constexpr numeric_array &operator*=(T const &rhs) noexcept
988 {
989 return *this = *this * rhs;
990 }
991
992 constexpr numeric_array &operator/=(numeric_array const &rhs) noexcept
993 {
994 return *this = *this / rhs;
995 }
996
997 constexpr numeric_array &operator/=(T const &rhs) noexcept
998 {
999 return *this = *this / rhs;
1000 }
1001
1002 constexpr numeric_array &operator%=(numeric_array const &rhs) noexcept
1003 {
1004 return *this = *this % rhs;
1005 }
1006
1007 constexpr numeric_array &operator%=(T const &rhs) noexcept
1008 {
1009 return *this = *this % rhs;
1010 }
1011
1012 constexpr static ssize_t get_zero = -1;
1013 constexpr static ssize_t get_one = -2;
1014
1019 template<std::size_t I>
1020 [[nodiscard]] friend constexpr T &get(numeric_array &rhs) noexcept
1021 {
1022 static_assert(I < N, "Index out of bounds");
1023 return std::get<I>(rhs.v);
1024 }
1025
1031 template<ssize_t I>
1032 [[nodiscard]] friend constexpr T get(numeric_array &&rhs) noexcept
1033 {
1034 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
1035 static_assert(I >= -2 && I < narrow_cast<ssize_t>(N), "Index out of bounds");
1036 if constexpr (I == get_zero) {
1037 return T{0};
1038 } else if constexpr (I == get_one) {
1039 return T{1};
1040 } else {
1041 return std::get<I>(rhs.v);
1042 }
1043 }
1044
1051 template<std::size_t I>
1052 [[nodiscard]] constexpr friend T extract(numeric_array const &rhs) noexcept
1053 {
1054 static_assert(I < N);
1055
1056 if (not std::is_constant_evaluated()) {
1057#if defined(HI_HAS_AVX2)
1058 if constexpr (is_i16x16 or is_u16x16) {
1059 return static_cast<T>(_mm256_extract_epi16(rhs.v.reg(), I));
1060 } else if constexpr (is_i8x32 or is_u8x32) {
1061 return static_cast<T>(_mm256_extract_epi8(rhs.v.reg(), I));
1062 }
1063#endif
1064#if defined(HI_HAS_AVX)
1065 if constexpr (is_f64x4) {
1066 return bit_cast<T>(_mm256_extract_epi64(_mm256_castpd_si256(rhs.v.reg()), I));
1067 } else if constexpr (is_f32x8) {
1068 return bit_cast<T>(_mm256_extract_epi32(_mm256_castps_si256(rhs.v.reg()), I));
1069 } else if constexpr (is_i64x4 or is_u64x4) {
1070 return static_cast<T>(_mm256_extract_epi64(rhs.v.reg(), I));
1071 } else if constexpr (is_i32x8 or is_u32x8) {
1072 return static_cast<T>(_mm256_extract_epi32(rhs.v.reg(), I));
1073 }
1074#endif
1075#if defined(HI_HAS_SSE4_1)
1076 if constexpr (is_f64x2) {
1077 return bit_cast<T>(_mm_extract_epi64(_mm_castpd_si128(rhs.v.reg()), I));
1078 } else if constexpr (is_f32x4) {
1079 return std::bit_cast<T>(_mm_extract_ps(rhs.v.reg(), I));
1080 } else if constexpr (is_i64x2 or is_u64x2) {
1081 return static_cast<T>(_mm_extract_epi64(rhs.v.reg(), I));
1082 } else if constexpr (is_i32x4 or is_u32x4) {
1083 return static_cast<T>(_mm_extract_epi32(rhs.v.reg(), I));
1084 } else if constexpr (is_i8x16 or is_u8x16) {
1085 return static_cast<T>(_mm_extract_epi8(rhs.v.reg(), I));
1086 }
1087#endif
1088#if defined(HI_HAS_SSE2)
1089 if constexpr (is_i16x8 or is_u16x8) {
1090 return static_cast<T>(_mm_extract_epi16(rhs.v.reg(), I));
1091 }
1092#endif
1093 }
1094
1095 return get<I>(rhs);
1096 }
1097
1106 template<std::size_t I, std::size_t ZeroMask = 0>
1107 [[nodiscard]] constexpr friend numeric_array insert(numeric_array const &lhs, T rhs) noexcept
1108 requires(is_f32x4 or is_i32x4 or is_u32x4)
1109 {
1110 static_assert(I < N);
1111 static_assert(ZeroMask <= ((1 << N) - 1));
1112
1113 if (not std::is_constant_evaluated()) {
1114#if defined(HI_HAS_SSE4_1)
1115 if constexpr (is_f32x4) {
1116 constexpr int imm8 = (I << 4) | ZeroMask;
1117 return numeric_array{_mm_insert_ps(lhs.reg(), _mm_set_ss(rhs), imm8)};
1118 } else if constexpr (is_i32x4 or is_u32x4) {
1119 constexpr int imm8 = (I << 4) | ZeroMask;
1120 return numeric_array{
1121 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(_mm_set1_epi32(rhs)), imm8))};
1122 }
1123#endif
1124 }
1125
1126 auto r = lhs;
1127 std::get<I>(r.v) = rhs;
1128 for (std::size_t i = 0; i != N; ++i) {
1129 if ((ZeroMask >> i) & 1) {
1130 r.v[i] = T{};
1131 }
1132 }
1133 return r;
1134 }
1135
1141 template<ssize_t I>
1142 [[nodiscard]] friend constexpr T get(numeric_array const &rhs) noexcept
1143 {
1144 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
1145 static_assert(I >= -2 && I < narrow_cast<ssize_t>(N), "Index out of bounds");
1146 if constexpr (I == get_zero) {
1147 return T{0};
1148 } else if constexpr (I == get_one) {
1149 return T{1};
1150 } else {
1151 return std::get<I>(rhs.v);
1152 }
1153 }
1154
1159 template<std::size_t Mask = ~std::size_t{0}>
1160 [[nodiscard]] friend constexpr numeric_array zero(numeric_array rhs) noexcept
1161 {
1162 if (not std::is_constant_evaluated()) {
1163#if defined(HI_HAS_SSE4_1)
1164 if constexpr (is_f32x4) {
1165 return numeric_array{_mm_insert_ps(rhs.reg(), rhs.reg(), Mask)};
1166 } else if constexpr (is_i32x4 or is_u32x4) {
1167 return numeric_array{
1168 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(rhs.reg()), _mm_castsi128_ps(rhs.reg()), Mask))};
1169 }
1170#endif
1171 }
1172
1173 auto r = numeric_array{};
1174 for (std::size_t i = 0; i != N; ++i) {
1175 if (static_cast<bool>((Mask >> i) & 1)) {
1176 r.v[i] = T{0};
1177 } else {
1178 r.v[i] = rhs.v[i];
1179 }
1180 }
1181 return r;
1182 }
1183
1184 template<std::size_t Mask>
1185 [[nodiscard]] friend constexpr numeric_array blend(numeric_array const &lhs, numeric_array const &rhs) noexcept
1186 {
1187 if (not std::is_constant_evaluated()) {
1188#if defined(HI_HAS_AVX2)
1189 if constexpr (is_i32x8) {
1190 return numeric_array{_mm256_blend_epi32(lhs.reg(), rhs.reg(), Mask)};
1191 } else if constexpr (is_i32x4 or is_u32x4) {
1192 return numeric_array{_mm_blend_epi32(lhs.reg(), rhs.reg(), Mask)};
1193 } else if constexpr (is_i16x16 or is_u16x16) {
1194 return numeric_array{_mm256_blend_epi16(lhs.reg(), rhs.reg(), Mask)};
1195 }
1196#endif
1197#if defined(HI_HAS_AVX)
1198 if constexpr (is_f64x4) {
1199 return numeric_array{_mm256_blend_pd(lhs.reg(), rhs.reg(), Mask)};
1200 } else if constexpr (is_f32x8) {
1201 return numeric_array{_mm256_blend_ps(lhs.reg(), rhs.reg(), Mask)};
1202 } else if constexpr (is_i64x4 or is_u64x4) {
1203 return numeric_array{
1204 _mm256_castpd_si256(_mm256_blend_pd(_mm256_castsi256_pd(lhs.reg()), _mm256_castsi256_pd(rhs.reg()), Mask))};
1205 } else if constexpr (is_i32x8 or is_u32x8) {
1206 return numeric_array{
1207 _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg()), Mask))};
1208 }
1209#endif
1210#if defined(HI_HAS_SSE4_1)
1211 if constexpr (is_f64x2) {
1212 return numeric_array{_mm_blend_pd(lhs.reg(), rhs.reg(), Mask)};
1213 } else if constexpr (is_f32x4) {
1214 return numeric_array{_mm_blend_ps(lhs.reg(), rhs.reg(), Mask)};
1215 } else if constexpr (is_i64x2 or is_u64x2) {
1216 return numeric_array{
1217 _mm_castpd_si128(_mm_blend_pd(_mm_castsi128_pd(lhs.reg()), _mm_castsi128_pd(rhs.reg()), Mask))};
1218 } else if constexpr (is_i32x4 or is_u32x4) {
1219 return numeric_array{
1220 _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg()), Mask))};
1221 } else if constexpr (is_i16x8 or is_u16x8) {
1222 return numeric_array{_mm_blend_epi16(lhs.reg(), rhs.reg(), Mask)};
1223 }
1224#endif
1225 }
1226
1227 auto r = numeric_array{};
1228 for (std::size_t i = 0; i != N; ++i) {
1229 r[i] = static_cast<bool>((Mask >> i) & 1) ? rhs[i] : lhs[i];
1230 }
1231 return r;
1232 }
1233
1236 [[nodiscard]] friend constexpr numeric_array blend(numeric_array const &a, numeric_array const &b, numeric_array const &mask)
1237 {
1238 if (not std::is_constant_evaluated()) {
1239#if defined(HI_HAS_AVX2)
1240 if constexpr (is_i8x32 or is_u8x32) {
1241 return numeric_array{_mm256_blendv_epi8(a.reg(), b.reg(), mask.reg())};
1242 }
1243#endif
1244#if defined(HI_HAS_AVX)
1245 if constexpr (is_f64x4) {
1246 return numeric_array{_mm256_blendv_pd(a.reg(), b.reg(), mask.reg())};
1247 } else if constexpr (is_f32x8) {
1248 return numeric_array{_mm256_blendv_ps(a.reg(), b.reg(), mask.reg())};
1249 } else if constexpr (is_i64x4 or is_u64x4) {
1250 return numeric_array{_mm256_castpd_si256(_mm256_blendv_pd(
1251 _mm256_castsi256_pd(a.reg()), _mm256_castsi256_pd(b.reg()), _mm256_castsi256_pd(mask.reg())))};
1252 } else if constexpr (is_i32x8 or is_u32x8) {
1253 return numeric_array{_mm256_castps_si256(_mm256_blendv_ps(
1254 _mm256_castsi256_ps(a.reg()), _mm256_castsi256_ps(b.reg()), _mm256_castsi256_ps(mask.reg())))};
1255 }
1256#endif
1257#if defined(HI_HAS_SSE4_1)
1258 if constexpr (is_f64x2) {
1259 return numeric_array{_mm_blendv_pd(a.reg(), b.reg(), mask.reg())};
1260 } else if constexpr (is_f32x4) {
1261 return numeric_array{_mm_blendv_ps(a.reg(), b.reg(), mask.reg())};
1262 } else if constexpr (is_i64x2 or is_u64x2) {
1263 return numeric_array{_mm_castpd_si128(
1264 _mm_blendv_pd(_mm_castsi128_pd(a.reg()), _mm_castsi128_pd(b.reg()), _mm_castsi128_pd(mask.reg())))};
1265 } else if constexpr (is_i32x4 or is_u32x4) {
1266 return numeric_array{_mm_castps_si128(
1267 _mm_blendv_ps(_mm_castsi128_ps(a.reg()), _mm_castsi128_ps(b.reg()), _mm_castsi128_ps(mask.reg())))};
1268 } else if constexpr (is_i8x16 or is_u8x16) {
1269 return numeric_array{_mm_blendv_epi8(a.reg(), b.reg(), mask.reg())};
1270 }
1271#endif
1272 }
1273
1274 auto r = numeric_array{};
1275 for (std::size_t i = 0; i != N; ++i) {
1276 r[i] = mask[i] != T{0} ? b[i] : a[i];
1277 }
1278 return r;
1279 }
1280
1285 template<std::size_t Mask>
1286 [[nodiscard]] friend constexpr numeric_array neg(numeric_array rhs) noexcept
1287 {
1288 return blend<Mask>(rhs, -rhs);
1289 }
1290
1291 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const &rhs) noexcept
1292 {
1293 return T{0} - rhs;
1294 }
1295
1296 [[nodiscard]] friend constexpr numeric_array abs(numeric_array const &rhs) noexcept
1297 {
1298 if (not std::is_constant_evaluated()) {
1299#if defined(HI_HAS_AVX2)
1300 if constexpr (is_i32x8) {
1301 return numeric_array{_mm256_abs_epi32(rhs.reg())};
1302 } else if constexpr (is_i16x16) {
1303 return numeric_array{_mm256_abs_epi16(rhs.reg())};
1304 } else if constexpr (is_i8x32) {
1305 return numeric_array{_mm256_abs_epi8(rhs.reg())};
1306 }
1307#endif
1308#if defined(HI_HAS_SSSE3)
1309 if constexpr (is_i32x4) {
1310 return numeric_array{_mm_abs_epi32(rhs.reg())};
1311 } else if constexpr (is_i16x8) {
1312 return numeric_array{_mm_abs_epi16(rhs.reg())};
1313 } else if constexpr (is_i8x16) {
1314 return numeric_array{_mm_abs_epi8(rhs.reg())};
1315 }
1316#endif
1317#if defined(HI_HAS_SSE2)
1318 if constexpr (is_f64x2) {
1319 return numeric_array{_mm_castsi128_ps(_mm_srli_epi64(_mm_slli_epi64(_mm_castpd_si128(rhs.reg()), 1), 1))};
1320 } else if constexpr (is_f32x4) {
1321 return numeric_array{_mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(_mm_castps_si128(rhs.reg()), 1), 1))};
1322 }
1323#endif
1324 }
1325
1326 return max(rhs, -rhs);
1327 }
1328
1329 [[nodiscard]] friend constexpr numeric_array rcp(numeric_array const &rhs) noexcept
1330 {
1331 if (not std::is_constant_evaluated()) {
1332#if defined(HI_HAS_AVX)
1333 if constexpr (is_f32x8) {
1334 return numeric_array{_mm256_rcp_ps(rhs.reg())};
1335 }
1336#endif
1337#if defined(HI_HAS_SSE)
1338 if constexpr (is_f32x4) {
1339 return numeric_array{_mm_rcp_ps(rhs.reg())};
1340 }
1341#endif
1342 }
1343
1344 return T{1} / rhs;
1345 }
1346
1347 [[nodiscard]] friend constexpr numeric_array sqrt(numeric_array const &rhs) noexcept
1348 {
1349 if (not std::is_constant_evaluated()) {
1350#if defined(HI_HAS_AVX)
1351 if constexpr (is_f64x4) {
1352 return numeric_array{_mm256_sqrt_pd(rhs.reg())};
1353 } else if constexpr (is_f32x8) {
1354 return numeric_array{_mm256_sqrt_ps(rhs.reg())};
1355 }
1356#endif
1357#if defined(HI_HAS_SSE2)
1358 if constexpr (is_f64x2) {
1359 return numeric_array{_mm_sqrt_pd(rhs.reg())};
1360 }
1361#endif
1362#if defined(HI_HAS_SSE)
1363 if constexpr (is_f32x4) {
1364 return numeric_array{_mm_sqrt_ps(rhs.reg())};
1365 }
1366#endif
1367 }
1368
1369 auto r = numeric_array{};
1370 for (std::size_t i = 0; i != N; ++i) {
1371 r[i] = std::sqrt(rhs.v[i]);
1372 }
1373 return r;
1374 }
1375
1376 [[nodiscard]] friend constexpr numeric_array rcp_sqrt(numeric_array const &rhs) noexcept
1377 {
1378 if (not std::is_constant_evaluated()) {
1379#if defined(HI_HAS_AVX)
1380 if constexpr (is_f32x8) {
1381 return numeric_array{_mm256_rsqrt_ps(rhs.reg())};
1382 }
1383#endif
1384#if defined(HI_HAS_SSE)
1385 if constexpr (is_f32x4) {
1386 return numeric_array{_mm_rsqrt_ps(rhs.reg())};
1387 }
1388#endif
1389 }
1390
1391 return rcp(sqrt(rhs));
1392 }
1393
1394 [[nodiscard]] friend constexpr numeric_array floor(numeric_array const &rhs) noexcept
1395 requires(std::is_floating_point_v<value_type>)
1396 {
1397 if (not std::is_constant_evaluated()) {
1398#if defined(HI_HAS_AVX)
1399 if constexpr (is_f64x4) {
1400 return numeric_array{_mm256_floor_pd(rhs.reg())};
1401 } else if constexpr (is_f32x8) {
1402 return numeric_array{_mm256_floor_ps(rhs.reg())};
1403 }
1404#endif
1405#if defined(HI_HAS_SSE4_1)
1406 if constexpr (is_f64x2) {
1407 return numeric_array{_mm_floor_pd(rhs.reg())};
1408 } else if constexpr (is_f32x4) {
1409 return numeric_array{_mm_floor_ps(rhs.reg())};
1410 }
1411#endif
1412 }
1413
1414 auto r = numeric_array{};
1415 for (std::size_t i = 0; i != N; ++i) {
1416 r[i] = std::floor(rhs.v[i]);
1417 }
1418 return r;
1419 }
1420
1421 [[nodiscard]] friend constexpr numeric_array ceil(numeric_array const &rhs) noexcept
1422 requires(std::is_floating_point_v<value_type>)
1423 {
1424 if (not std::is_constant_evaluated()) {
1425#if defined(HI_HAS_AVX)
1426 if constexpr (is_f64x4) {
1427 return numeric_array{_mm256_ceil_pd(rhs.reg())};
1428 } else if constexpr (is_f32x8) {
1429 return numeric_array{_mm256_ceil_ps(rhs.reg())};
1430 }
1431#endif
1432#if defined(HI_HAS_SSE4_1)
1433 if constexpr (is_f64x2) {
1434 return numeric_array{_mm_ceil_pd(rhs.reg())};
1435 } else if constexpr (is_f32x4) {
1436 return numeric_array{_mm_ceil_ps(rhs.reg())};
1437 }
1438#endif
1439 }
1440
1441 auto r = numeric_array{};
1442 for (std::size_t i = 0; i != N; ++i) {
1443 r[i] = std::ceil(rhs.v[i]);
1444 }
1445 return r;
1446 }
1447
1448 [[nodiscard]] friend constexpr numeric_array round(numeric_array const &rhs) noexcept
1449 requires(std::is_floating_point_v<value_type>)
1450 {
1451 if (not std::is_constant_evaluated()) {
1452#if defined(HI_HAS_AVX)
1453 if constexpr (is_f64x4) {
1454 return numeric_array{_mm256_round_pd(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1455 } else if constexpr (is_f32x8) {
1456 return numeric_array{_mm256_round_ps(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1457 }
1458#endif
1459#if defined(HI_HAS_SSE4_1)
1460 if constexpr (is_f64x2) {
1461 return numeric_array{_mm_round_pd(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1462 } else if constexpr (is_f32x4) {
1463 return numeric_array{_mm_round_ps(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1464 }
1465#endif
1466 }
1467
1468 auto r = numeric_array{};
1469 for (std::size_t i = 0; i != N; ++i) {
1470 r[i] = std::round(rhs.v[i]);
1471 }
1472 return r;
1473 }
1474
1482 template<std::size_t Mask>
1483 [[nodiscard]] hi_force_inline friend constexpr T dot(numeric_array const &lhs, numeric_array const &rhs) noexcept
1484 {
1485 if (not std::is_constant_evaluated()) {
1486#if defined(HI_HAS_SSE4_1)
1487 if constexpr (is_f64x2) {
1488 return std::bit_cast<double>(_mm_extract_epi64(_mm_dp_pd(lhs.reg(), rhs.reg(), (Mask << 4) | 0xf), 0));
1489 } else if constexpr (is_f32x4) {
1490 return std::bit_cast<float>(_mm_extract_ps(_mm_dp_ps(lhs.reg(), rhs.reg(), (Mask << 4) | 0xf), 0));
1491 }
1492#endif
1493 }
1494
1495 auto r = T{};
1496 for (std::size_t i = 0; i != N; ++i) {
1497 if (static_cast<bool>(Mask & (1_uz << i))) {
1498 r += lhs.v[i] * rhs.v[i];
1499 }
1500 }
1501 return r;
1502 }
1503
1511 template<std::size_t Mask>
1512 [[nodiscard]] friend constexpr T hypot(numeric_array const &rhs) noexcept
1513 {
1514 return std::sqrt(dot<Mask>(rhs, rhs));
1515 }
1516
1524 template<std::size_t Mask>
1525 [[nodiscard]] hi_force_inline friend constexpr T squared_hypot(numeric_array const& rhs) noexcept
1526 {
1527 return dot<Mask>(rhs, rhs);
1528 }
1529
1536 template<std::size_t Mask>
1537 [[nodiscard]] friend constexpr T rcp_hypot(numeric_array const &rhs) noexcept
1538 {
1539 if (not std::is_constant_evaluated()) {
1540#if defined(HI_HAS_SSE4_1)
1541 if constexpr (is_f32x4) {
1542 return std::bit_cast<float>(_mm_extract_ps(_mm_rsqrt_ps(_mm_dp_ps(rhs.reg(), rhs.reg(), (Mask << 4) | 0xf)), 0));
1543 }
1544#endif
1545 }
1546
1547 return 1.0f / hypot<Mask>(rhs);
1548 }
1549
1558 template<std::size_t Mask>
1559 [[nodiscard]] friend constexpr numeric_array normalize(numeric_array const &rhs) noexcept
1560 {
1561 hi_axiom(rhs.is_vector());
1562
1563 if (not std::is_constant_evaluated()) {
1564#if defined(HI_HAS_SSE4_1)
1565 if constexpr (is_f32x4) {
1566 hilet rhs_ = rhs.reg();
1567 hilet tmp = _mm_mul_ps(_mm_rsqrt_ps(_mm_dp_ps(rhs_, rhs_, (Mask << 4) | 0xf)), rhs_);
1568 return numeric_array{_mm_insert_ps(tmp, tmp, ~Mask & 0xf)};
1569 }
1570#endif
1571 }
1572
1573 hilet rcp_hypot_ = rcp_hypot<Mask>(rhs);
1574
1575 auto r = numeric_array{};
1576 for (std::size_t i = 0; i != N; ++i) {
1577 if (static_cast<bool>(Mask & (1_uz << i))) {
1578 r.v[i] = rhs.v[i] * rcp_hypot_;
1579 }
1580 }
1581 return r;
1582 }
1583
1584 [[nodiscard]] friend constexpr std::size_t eq(numeric_array const &lhs, numeric_array const &rhs) noexcept
1585 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1586 {
1587 if (not std::is_constant_evaluated()) {
1588#if defined(HI_HAS_AVX2)
1589 if constexpr (is_i64x4 or is_u64x4) {
1590 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(lhs.reg(), rhs.reg()))));
1591 } else if constexpr (is_i32x8 or is_u32x8) {
1592 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(lhs.reg(), rhs.reg()))));
1593 } else if constexpr (is_i8x32 or is_u8x32) {
1594 return static_cast<std::size_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(lhs.reg(), rhs.reg())));
1595 }
1596#endif
1597#if defined(HI_HAS_AVX)
1598 if constexpr (is_f64x4) {
1599 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_EQ_OQ)));
1600 } else if constexpr (is_f32x8) {
1601 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_EQ_OQ)));
1602 }
1603#endif
1604#if defined(HI_HAS_SSE4_1)
1605 if constexpr (is_i64x2 or is_u64x2) {
1606 return static_cast<std::size_t>(_mm_movemask_pd(_mm_castsi128_pd(_mm_cmpeq_epi64(lhs.reg(), rhs.reg()))));
1607 }
1608#endif
1609#if defined(HI_HAS_SSE2)
1610 if constexpr (is_f64x2) {
1611 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpeq_pd(lhs.reg(), rhs.reg())));
1612 } else if constexpr (is_i32x4 or is_u32x4) {
1613 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs.reg(), rhs.reg()))));
1614 } else if constexpr (is_i8x16 or is_u8x16) {
1615 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(lhs.reg(), rhs.reg())));
1616 }
1617#endif
1618#if defined(HI_HAS_SSE)
1619 if constexpr (is_f32x4) {
1620 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpeq_ps(lhs.reg(), rhs.reg())));
1621 }
1622#endif
1623 }
1624
1625 std::size_t r = 0;
1626 for (std::size_t i = 0; i != N; ++i) {
1627 r |= static_cast<std::size_t>(lhs.v[i] == rhs.v[i]) << i;
1628 }
1629 return r;
1630 }
1631
1632 [[nodiscard]] friend constexpr std::size_t ne(numeric_array const &lhs, numeric_array const &rhs) noexcept
1633 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1634 {
1635 if (not std::is_constant_evaluated()) {
1636#if defined(HI_HAS_AVX)
1637 if constexpr (is_f64x4) {
1638 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_NEQ_OQ)));
1639 } else if constexpr (is_f32x8) {
1640 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_NEQ_OQ)));
1641 }
1642#endif
1643#if defined(HI_HAS_SSE2)
1644 if constexpr (is_f64x2) {
1645 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpneq_pd(lhs.reg(), rhs.reg())));
1646 }
1647#endif
1648#if defined(HI_HAS_SSE)
1649 if constexpr (is_f32x4) {
1650 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpneq_ps(lhs.reg(), rhs.reg())));
1651 }
1652#endif
1653 }
1654
1655 constexpr std::size_t not_mask = (1 << N) - 1;
1656 return eq(lhs, rhs) ^ not_mask;
1657 }
1658
1659 [[nodiscard]] friend constexpr std::size_t gt(numeric_array const &lhs, numeric_array const &rhs) noexcept
1660 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1661 {
1662 if (not std::is_constant_evaluated()) {
1663#if defined(HI_HAS_AVX2)
1664 if constexpr (is_i64x4) {
1665 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpgt_epi64(lhs.reg(), rhs.reg()))));
1666 } else if constexpr (is_i32x8) {
1667 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(lhs.reg(), rhs.reg()))));
1668 } else if constexpr (is_i8x32) {
1669 return static_cast<std::size_t>(_mm256_movemask_epi8(_mm256_cmpgt_epi8(lhs.reg(), rhs.reg())));
1670 }
1671#endif
1672#if defined(HI_HAS_AVX)
1673 if constexpr (is_f64x4) {
1674 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_GT_OQ)));
1675 } else if constexpr (is_f32x8) {
1676 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_GT_OQ)));
1677 }
1678#endif
1679#if defined(HI_HAS_SSE4_1)
1680 if constexpr (is_i64x2) {
1681 return static_cast<std::size_t>(_mm_movemask_pd(_mm_castsi128_pd(_mm_cmpgt_epi64(lhs.reg(), rhs.reg()))));
1682 }
1683#endif
1684#if defined(HI_HAS_SSE2)
1685 if constexpr (is_f64x2) {
1686 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpgt_pd(lhs.reg(), rhs.reg())));
1687 } else if constexpr (is_i32x4) {
1688 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(lhs.reg(), rhs.reg()))));
1689 } else if constexpr (is_i8x16) {
1690 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(lhs.reg(), rhs.reg())));
1691 }
1692#endif
1693#if defined(HI_HAS_SSE)
1694 if constexpr (is_f32x4) {
1695 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpgt_ps(lhs.reg(), rhs.reg())));
1696 }
1697#endif
1698 }
1699
1700 unsigned int r = 0;
1701 for (std::size_t i = 0; i != N; ++i) {
1702 r |= static_cast<std::size_t>(lhs.v[i] > rhs.v[i]) << i;
1703 }
1704 return r;
1705 }
1706
1707 [[nodiscard]] friend constexpr std::size_t lt(numeric_array const &lhs, numeric_array const &rhs) noexcept
1708 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1709 {
1710 if (not std::is_constant_evaluated()) {
1711#if defined(HI_HAS_AVX)
1712 if constexpr (is_f64x4) {
1713 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_LT_OQ)));
1714 } else if constexpr (is_f32x8) {
1715 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_LT_OQ)));
1716 }
1717#endif
1718#if defined(HI_HAS_SSE2)
1719 if constexpr (is_f64x2) {
1720 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmplt_pd(lhs.reg(), rhs.reg())));
1721 } else if constexpr (is_i32x4) {
1722 return static_cast<std::size_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(lhs.reg(), rhs.reg()))));
1723 } else if constexpr (is_i8x16) {
1724 return static_cast<std::size_t>(_mm_movemask_epi8(_mm_cmplt_epi8(lhs.reg(), rhs.reg())));
1725 }
1726#endif
1727#if defined(HI_HAS_SSE)
1728 if constexpr (is_f32x4) {
1729 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmplt_ps(lhs.reg(), rhs.reg())));
1730 }
1731#endif
1732 }
1733
1734 // gt() and eq() has best x64 support.
1735 return gt(rhs, lhs);
1736 }
1737
1738 [[nodiscard]] friend constexpr std::size_t ge(numeric_array const &lhs, numeric_array const &rhs) noexcept
1739 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1740 {
1741 if (not std::is_constant_evaluated()) {
1742#if defined(HI_HAS_AVX)
1743 if constexpr (is_f64x4) {
1744 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_GE_OQ)));
1745 } else if constexpr (is_f32x8) {
1746 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_GE_OQ)));
1747 }
1748#endif
1749#if defined(HI_HAS_SSE2)
1750 if constexpr (is_f64x2) {
1751 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmpge_pd(lhs.reg(), rhs.reg())));
1752 }
1753#endif
1754#if defined(HI_HAS_SSE)
1755 if constexpr (is_f32x4) {
1756 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmpge_ps(lhs.reg(), rhs.reg())));
1757 }
1758#endif
1759 }
1760
1761 // gt() and eq() has best x64 support.
1762 return gt(lhs, rhs) | eq(lhs, rhs);
1763 }
1764
1765 [[nodiscard]] friend constexpr std::size_t le(numeric_array const &lhs, numeric_array const &rhs) noexcept
1766 requires(N <= sizeof(std::size_t) * CHAR_BIT)
1767 {
1768 if (not std::is_constant_evaluated()) {
1769#if defined(HI_HAS_AVX)
1770 if constexpr (is_f64x4) {
1771 return static_cast<std::size_t>(_mm256_movemask_pd(_mm256_cmp_pd(lhs.reg(), rhs.reg(), _CMP_LE_OQ)));
1772 } else if constexpr (is_f32x8) {
1773 return static_cast<std::size_t>(_mm256_movemask_ps(_mm256_cmp_ps(lhs.reg(), rhs.reg(), _CMP_LE_OQ)));
1774 }
1775#endif
1776#if defined(HI_HAS_SSE2)
1777 if constexpr (is_f64x2) {
1778 return static_cast<std::size_t>(_mm_movemask_pd(_mm_cmple_pd(lhs.reg(), rhs.reg())));
1779 }
1780#endif
1781#if defined(HI_HAS_SSE)
1782 if constexpr (is_f32x4) {
1783 return static_cast<std::size_t>(_mm_movemask_ps(_mm_cmple_ps(lhs.reg(), rhs.reg())));
1784 }
1785#endif
1786 }
1787
1788 // gt() and eq() has best x64 support.
1789 return gt(rhs, lhs) | eq(rhs, lhs);
1790 }
1791
1792 [[nodiscard]] friend constexpr numeric_array gt_mask(numeric_array const &lhs, numeric_array const &rhs) noexcept
1793 {
1794 if (not std::is_constant_evaluated()) {
1795#if defined(HI_HAS_SSE4_2)
1796 if constexpr (is_i64x2) {
1797 return numeric_array{_mm_cmpgt_epi64(lhs.reg(), rhs.reg())};
1798 }
1799#endif
1800#if defined(HI_HAS_SSE2)
1801 if constexpr (is_i32x4) {
1802 return numeric_array{_mm_cmpgt_epi32(lhs.reg(), rhs.reg())};
1803 } else if constexpr (is_i16x8) {
1804 return numeric_array{_mm_cmpgt_epi16(lhs.reg(), rhs.reg())};
1805 } else if constexpr (is_i8x16) {
1806 return numeric_array{_mm_cmpgt_epi8(lhs.reg(), rhs.reg())};
1807 }
1808#endif
1809#if defined(HI_HAS_SSE)
1810 if constexpr (is_f32x4) {
1811 return numeric_array{_mm_cmpgt_ps(lhs.reg(), rhs.reg())};
1812 }
1813#endif
1814 }
1815
1816 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
1817 constexpr auto ones = std::bit_cast<T>(~uint_type{0});
1818
1819 auto r = numeric_array{};
1820 for (std::size_t i = 0; i != N; ++i) {
1821 r[i] = lhs.v[i] > rhs.v[i] ? ones : T{0};
1822 }
1823 return r;
1824 }
1825
1826 [[nodiscard]] friend constexpr bool operator==(numeric_array const &lhs, numeric_array const &rhs) noexcept
1827 {
1828 return not ne(lhs, rhs);
1829 }
1830
1831 [[nodiscard]] friend constexpr numeric_array operator<<(numeric_array const &lhs, unsigned int rhs) noexcept
1832 {
1833 if (not std::is_constant_evaluated()) {
1834#if defined(HI_HAS_AVX2)
1835 if constexpr (is_f64x4) {
1836 return numeric_array{_mm256_castsi256_pd(_mm256_slli_epi64(_mm256_castpd_si256(lhs.reg()), rhs))};
1837 } else if constexpr (is_f32x8) {
1838 return numeric_array{_mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(lhs.reg()), rhs))};
1839 } else if constexpr (is_i64x4 or is_u64x4) {
1840 return numeric_array{_mm256_slli_epi64(lhs.reg(), rhs)};
1841 } else if constexpr (is_i32x8 or is_u32x8) {
1842 return numeric_array{_mm256_slli_epi32(lhs.reg(), rhs)};
1843 } else if constexpr (is_i16x16 or is_u16x16) {
1844 return numeric_array{_mm256_slli_epi16(lhs.reg(), rhs)};
1845 }
1846#endif
1847#if defined(HI_HAS_SSE2)
1848 if constexpr (is_f64x2) {
1849 return numeric_array{_mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(lhs.reg()), rhs))};
1850 } else if constexpr (is_f32x4) {
1851 return numeric_array{_mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(lhs.reg()), rhs))};
1852 } else if constexpr (is_i64x2 or is_u64x2) {
1853 return numeric_array{_mm_slli_epi64(lhs.reg(), rhs)};
1854 } else if constexpr (is_i32x4 or is_u32x4) {
1855 return numeric_array{_mm_slli_epi32(lhs.reg(), rhs)};
1856 } else if constexpr (is_i16x8 or is_u16x8) {
1857 return numeric_array{_mm_slli_epi16(lhs.reg(), rhs)};
1858 }
1859#endif
1860 }
1861
1862 auto r = numeric_array{};
1863 for (std::size_t i = 0; i != N; ++i) {
1864 r.v[i] = lhs.v[i] << rhs;
1865 }
1866 return r;
1867 }
1868
1869 [[nodiscard]] friend constexpr numeric_array operator>>(numeric_array const &lhs, unsigned int rhs) noexcept
1870 {
1871 if (not std::is_constant_evaluated()) {
1872#if defined(HI_HAS_AVX2)
1873 if constexpr (is_f64x4) {
1874 return numeric_array{_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_castpd_si256(lhs.reg()), rhs))};
1875 } else if constexpr (is_f32x8) {
1876 return numeric_array{_mm256_castsi256_ps(_mm256_srli_epi32(_mm256_castps_si256(lhs.reg()), rhs))};
1877 } else if constexpr (is_u64x4) {
1878 return numeric_array{_mm256_srli_epi64(lhs.reg(), rhs)};
1879 } else if constexpr (is_i32x8) {
1880 return numeric_array{_mm256_srai_epi32(lhs.reg(), rhs)};
1881 } else if constexpr (is_u32x8) {
1882 return numeric_array{_mm256_srli_epi32(lhs.reg(), rhs)};
1883 } else if constexpr (is_i16x16) {
1884 return numeric_array{_mm256_srai_epi16(lhs.reg(), rhs)};
1885 } else if constexpr (is_u16x16) {
1886 return numeric_array{_mm256_srli_epi16(lhs.reg(), rhs)};
1887 }
1888#endif
1889#if defined(HI_HAS_SSE2)
1890 if constexpr (is_f64x2) {
1891 return numeric_array{_mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(lhs.reg()), rhs))};
1892 } else if constexpr (is_f32x4) {
1893 return numeric_array{_mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(lhs.reg()), rhs))};
1894 } else if constexpr (is_u64x2) {
1895 return numeric_array{_mm_srli_epi64(lhs.reg(), rhs)};
1896 } else if constexpr (is_i32x4) {
1897 return numeric_array{_mm_srai_epi32(lhs.reg(), rhs)};
1898 } else if constexpr (is_u32x4) {
1899 return numeric_array{_mm_srli_epi32(lhs.reg(), rhs)};
1900 } else if constexpr (is_i16x8) {
1901 return numeric_array{_mm_srai_epi16(lhs.reg(), rhs)};
1902 } else if constexpr (is_u16x8) {
1903 return numeric_array{_mm_srli_epi16(lhs.reg(), rhs)};
1904 }
1905#endif
1906 }
1907
1908 auto r = numeric_array{};
1909 for (std::size_t i = 0; i != N; ++i) {
1910 r.v[i] = lhs.v[i] >> rhs;
1911 }
1912 return r;
1913 }
1914
1915 [[nodiscard]] friend constexpr numeric_array operator|(numeric_array const &lhs, numeric_array const &rhs) noexcept
1916 {
1917 if (not std::is_constant_evaluated()) {
1918#if defined(HI_HAS_AVX2)
1919 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x8 or is_u16x8 or is_i8x32 or is_u8x32) {
1920 return numeric_array{_mm256_or_si256(lhs.reg(), rhs.reg())};
1921 }
1922#endif
1923#if defined(HI_HAS_AVX)
1924 if constexpr (is_f64x4) {
1925 return numeric_array{_mm256_or_pd(lhs.reg(), rhs.reg())};
1926 } else if constexpr (is_f32x8) {
1927 return numeric_array{_mm256_or_ps(lhs.reg(), rhs.reg())};
1928 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x8 or is_u16x8 or is_i8x32 or is_u8x32) {
1929 return numeric_array{
1930 _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
1931 }
1932#endif
1933#if defined(HI_HAS_SSE2)
1934 if constexpr (is_f64x2) {
1935 return numeric_array{_mm_or_pd(lhs.reg(), rhs.reg())};
1936 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_i8x16) {
1937 return numeric_array{_mm_or_si128(lhs.reg(), rhs.reg())};
1938 }
1939#endif
1940#if defined(HI_HAS_SSE)
1941 if constexpr (is_f64x2) {
1942 return numeric_array{_mm_castps_pd(_mm_or_ps(_mm_castps_ps(lhs.reg()), _mm_castps_ps(rhs.reg())))};
1943
1944 } else if constexpr (is_f32x4) {
1945 return numeric_array{_mm_or_ps(lhs.reg(), rhs.reg())};
1946
1947 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_i8x16) {
1948 return numeric_array{_mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
1949 }
1950#endif
1951 }
1952
1953 using uint_type = make_uintxx_t<sizeof(T) * CHAR_BIT>;
1954
1955 auto r = numeric_array{};
1956 for (std::size_t i = 0; i != N; ++i) {
1957 r.v[i] =
1958 std::bit_cast<T>(static_cast<uint_type>(std::bit_cast<uint_type>(lhs.v[i]) | std::bit_cast<uint_type>(rhs.v[i])));
1959 }
1960 return r;
1961 }
1962
1963 [[nodiscard]] friend constexpr numeric_array operator|(numeric_array const &lhs, T const &rhs) noexcept
1964 {
1965 return lhs | broadcast(rhs);
1966 }
1967
1968 [[nodiscard]] friend constexpr numeric_array operator|(T const &lhs, numeric_array const &rhs) noexcept
1969 {
1970 return broadcast(lhs) | rhs;
1971 }
1972
1973 [[nodiscard]] friend constexpr numeric_array operator&(numeric_array const &lhs, numeric_array const &rhs) noexcept
1974 {
1975 if (not std::is_constant_evaluated()) {
1976#if defined(HI_HAS_AVX2)
1977 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x8 or is_u16x8 or is_i8x32 or is_u8x32) {
1978 return numeric_array{_mm256_and_si256(lhs.reg(), rhs.reg())};
1979 }
1980#endif
1981#if defined(HI_HAS_AVX)
1982 if constexpr (is_f64x4) {
1983 return numeric_array{_mm256_and_pd(lhs.reg(), rhs.reg())};
1984 } else if constexpr (is_f32x8) {
1985 return numeric_array{_mm256_and_ps(lhs.reg(), rhs.reg())};
1986 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x8 or is_u16x8 or is_i8x32 or is_u8x32) {
1987 return numeric_array{
1988 _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
1989 }
1990#endif
1991#if defined(HI_HAS_SSE2)
1992 if constexpr (is_f64x2) {
1993 return numeric_array{_mm_and_pd(lhs.reg(), rhs.reg())};
1994 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_i8x16) {
1995 return numeric_array{_mm_and_si128(lhs.reg(), rhs.reg())};
1996 }
1997#endif
1998#if defined(HI_HAS_SSE)
1999 if constexpr (is_f64x2) {
2000 return numeric_array{_mm_castps_pd(_mm_and_ps(_mm_castps_ps(lhs.reg()), _mm_castps_ps(rhs.reg())))};
2001
2002 } else if constexpr (is_f32x4) {
2003 return numeric_array{_mm_and_ps(lhs.reg(), rhs.reg())};
2004
2005 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_i8x16) {
2006 return numeric_array{_mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
2007 }
2008#endif
2009 }
2010
2011 auto r = numeric_array{};
2012 for (std::size_t i = 0; i != N; ++i) {
2013 r.v[i] = lhs.v[i] & rhs.v[i];
2014 }
2015 return r;
2016 }
2017
2018 [[nodiscard]] friend constexpr numeric_array operator&(numeric_array const &lhs, T const &rhs) noexcept
2019 {
2020 return lhs & broadcast(rhs);
2021 }
2022
2023 [[nodiscard]] friend constexpr numeric_array operator&(T const &lhs, numeric_array const &rhs) noexcept
2024 {
2025 return broadcast(lhs) & rhs;
2026 }
2027
2028 [[nodiscard]] friend constexpr numeric_array operator^(numeric_array const &lhs, numeric_array const &rhs) noexcept
2029 {
2030 if (not std::is_constant_evaluated()) {
2031#if defined(HI_HAS_AVX2)
2032 if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x8 or is_u16x8 or is_i8x32 or is_u8x32) {
2033 return numeric_array{_mm256_xor_si256(lhs.reg(), rhs.reg())};
2034 }
2035#endif
2036#if defined(HI_HAS_AVX)
2037 if constexpr (is_f64x4) {
2038 return numeric_array{_mm256_xor_pd(lhs.reg(), rhs.reg())};
2039 } else if constexpr (is_f32x8) {
2040 return numeric_array{_mm256_xor_ps(lhs.reg(), rhs.reg())};
2041 } else if constexpr (is_i64x4 or is_u64x4 or is_i32x8 or is_u32x8 or is_i16x8 or is_u16x8 or is_i8x32 or is_u8x32) {
2042 return numeric_array{
2043 _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(lhs.reg()), _mm256_castsi256_ps(rhs.reg())))};
2044 }
2045#endif
2046#if defined(HI_HAS_SSE2)
2047 if constexpr (is_f64x2) {
2048 return numeric_array{_mm_xor_pd(lhs.reg(), rhs.reg())};
2049 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_i8x16) {
2050 return numeric_array{_mm_xor_si128(lhs.reg(), rhs.reg())};
2051 }
2052#endif
2053#if defined(HI_HAS_SSE)
2054 if constexpr (is_f64x2) {
2055 return numeric_array{_mm_castps_pd(_mm_xor_ps(_mm_castps_ps(lhs.reg()), _mm_castps_ps(rhs.reg())))};
2056
2057 } else if constexpr (is_f32x4) {
2058 return numeric_array{_mm_xor_ps(lhs.reg(), rhs.reg())};
2059
2060 } else if constexpr (is_i64x2 or is_u64x2 or is_i32x4 or is_u32x4 or is_i16x8 or is_u16x8 or is_i8x16 or is_i8x16) {
2061 return numeric_array{_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg())))};
2062 }
2063#endif
2064 }
2065
2066 auto r = numeric_array{};
2067 for (std::size_t i = 0; i != N; ++i) {
2068 r.v[i] = lhs.v[i] ^ rhs.v[i];
2069 }
2070 return r;
2071 }
2072
2073 [[nodiscard]] friend constexpr numeric_array operator^(numeric_array const &lhs, T const &rhs) noexcept
2074 {
2075 return lhs ^ broadcast(rhs);
2076 }
2077
2078 [[nodiscard]] friend constexpr numeric_array operator^(T const &lhs, numeric_array const &rhs) noexcept
2079 {
2080 return broadcast(lhs) ^ rhs;
2081 }
2082
2083 [[nodiscard]] friend constexpr numeric_array operator+(numeric_array const &lhs, numeric_array const &rhs) noexcept
2084 {
2085 if (not std::is_constant_evaluated()) {
2086#if defined(HI_HAS_AVX2)
2087 if constexpr (is_i64x4 or is_u64x4) {
2088 return numeric_array{_mm256_add_epi64(lhs.reg(), rhs.reg())};
2089 } else if constexpr (is_i32x8 or is_u32x8) {
2090 return numeric_array{_mm256_add_epi32(lhs.reg(), rhs.reg())};
2091 } else if constexpr (is_i16x16 or is_u16x16) {
2092 return numeric_array{_mm256_add_epi16(lhs.reg(), rhs.reg())};
2093 } else if constexpr (is_i8x32 or is_u8x32) {
2094 return numeric_array{_mm256_add_epi8(lhs.reg(), rhs.reg())};
2095 }
2096#endif
2097#if defined(HI_HAS_AVX)
2098 if constexpr (is_f64x4) {
2099 return numeric_array{_mm256_add_pd(lhs.reg(), rhs.reg())};
2100 } else if constexpr (is_f32x8) {
2101 return numeric_array{_mm256_add_ps(lhs.reg(), rhs.reg())};
2102 }
2103#endif
2104#if defined(HI_HAS_SSE2)
2105 if constexpr (is_f64x2) {
2106 return numeric_array{_mm_add_pd(lhs.reg(), rhs.reg())};
2107 } else if constexpr (is_i64x2 or is_u64x2) {
2108 return numeric_array{_mm_add_epi64(lhs.reg(), rhs.reg())};
2109 } else if constexpr (is_i32x4 or is_u32x4) {
2110 return numeric_array{_mm_add_epi32(lhs.reg(), rhs.reg())};
2111 } else if constexpr (is_i16x8 or is_u16x8) {
2112 return numeric_array{_mm_add_epi16(lhs.reg(), rhs.reg())};
2113 } else if constexpr (is_i8x16 or is_u8x16) {
2114 return numeric_array{_mm_add_epi8(lhs.reg(), rhs.reg())};
2115 }
2116#endif
2117#if defined(HI_HAS_SSE)
2118 if constexpr (is_f32x4) {
2119 return numeric_array{_mm_add_ps(lhs.reg(), rhs.reg())};
2120 }
2121#endif
2122 }
2123
2124 auto r = numeric_array{};
2125 for (std::size_t i = 0; i != N; ++i) {
2126 r.v[i] = lhs.v[i] + rhs.v[i];
2127 }
2128 return r;
2129 }
2130
2131 [[nodiscard]] friend constexpr numeric_array operator+(numeric_array const &lhs, T const &rhs) noexcept
2132 {
2133 return lhs + broadcast(rhs);
2134 }
2135
2136 [[nodiscard]] friend constexpr numeric_array operator+(T const &lhs, numeric_array const &rhs) noexcept
2137 {
2138 return broadcast(lhs) + rhs;
2139 }
2140
2141 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const &lhs, numeric_array const &rhs) noexcept
2142 {
2143 if (not std::is_constant_evaluated()) {
2144#if defined(HI_HAS_AVX2)
2145 if constexpr (is_i64x4 or is_u64x4) {
2146 return numeric_array{_mm256_sub_epi64(lhs.reg(), rhs.reg())};
2147 } else if constexpr (is_i32x8 or is_u32x8) {
2148 return numeric_array{_mm256_sub_epi32(lhs.reg(), rhs.reg())};
2149 } else if constexpr (is_i16x16 or is_u16x16) {
2150 return numeric_array{_mm256_sub_epi16(lhs.reg(), rhs.reg())};
2151 } else if constexpr (is_i8x32 or is_u8x32) {
2152 return numeric_array{_mm256_sub_epi8(lhs.reg(), rhs.reg())};
2153 }
2154#endif
2155#if defined(HI_HAS_AVX)
2156 if constexpr (is_f64x4) {
2157 return numeric_array{_mm256_sub_pd(lhs.reg(), rhs.reg())};
2158 } else if constexpr (is_f32x8) {
2159 return numeric_array{_mm256_sub_ps(lhs.reg(), rhs.reg())};
2160 }
2161#endif
2162#if defined(HI_HAS_SSE2)
2163 if constexpr (is_f64x2) {
2164 return numeric_array{_mm_sub_pd(lhs.reg(), rhs.reg())};
2165 } else if constexpr (is_i64x2 or is_u64x2) {
2166 return numeric_array{_mm_sub_epi64(lhs.reg(), rhs.reg())};
2167 } else if constexpr (is_i32x4 or is_u32x4) {
2168 return numeric_array{_mm_sub_epi32(lhs.reg(), rhs.reg())};
2169 } else if constexpr (is_i16x8 or is_u16x8) {
2170 return numeric_array{_mm_sub_epi16(lhs.reg(), rhs.reg())};
2171 } else if constexpr (is_i8x16 or is_u8x16) {
2172 return numeric_array{_mm_sub_epi8(lhs.reg(), rhs.reg())};
2173 }
2174#endif
2175#if defined(HI_HAS_SSE)
2176 if constexpr (is_f32x4) {
2177 return numeric_array{_mm_sub_ps(lhs.reg(), rhs.reg())};
2178 }
2179#endif
2180 }
2181
2182 auto r = numeric_array{};
2183 for (std::size_t i = 0; i != N; ++i) {
2184 r.v[i] = lhs.v[i] - rhs.v[i];
2185 }
2186 return r;
2187 }
2188
2189 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const &lhs, T const &rhs) noexcept
2190 {
2191 return lhs - broadcast(rhs);
2192 }
2193
2194 [[nodiscard]] friend constexpr numeric_array operator-(T const &lhs, numeric_array const &rhs) noexcept
2195 {
2196 return broadcast(lhs) - rhs;
2197 }
2198
2199 [[nodiscard]] friend constexpr numeric_array operator*(numeric_array const &lhs, numeric_array const &rhs) noexcept
2200 {
2201 if (not std::is_constant_evaluated()) {
2202#if defined(HI_HAS_AVX2)
2203 if constexpr (is_i32x8) {
2204 return numeric_array{_mm256_mul_epi32(lhs.reg(), rhs.reg())};
2205 } else if constexpr (is_u32x8) {
2206 return numeric_array{_mm256_mul_epu32(lhs.reg(), rhs.reg())};
2207 }
2208#endif
2209#if defined(HI_HAS_AVX)
2210 if constexpr (is_f64x4) {
2211 return numeric_array{_mm256_mul_pd(lhs.reg(), rhs.reg())};
2212 } else if constexpr (is_f32x8) {
2213 return numeric_array{_mm256_mul_ps(lhs.reg(), rhs.reg())};
2214 }
2215#endif
2216#if defined(HI_HAS_SSE4_1)
2217 if constexpr (is_i32x4) {
2218 return numeric_array{_mm_mul_epi32(lhs.reg(), rhs.reg())};
2219 } else if constexpr (is_f16x4) {
2220 return numeric_array{numeric_array<float, 4>{lhs} * numeric_array<float, 4>{rhs}};
2221 }
2222#endif
2223#if defined(HI_HAS_SSE2)
2224 if constexpr (is_f64x2) {
2225 return numeric_array{_mm_mul_pd(lhs.reg(), rhs.reg())};
2226 }
2227#endif
2228#if defined(HI_HAS_SSE)
2229 if constexpr (is_f32x4) {
2230 return numeric_array{_mm_mul_ps(lhs.reg(), rhs.reg())};
2231 }
2232#endif
2233 }
2234
2235 auto r = numeric_array{};
2236 for (std::size_t i = 0; i != N; ++i) {
2237 r.v[i] = lhs.v[i] * rhs.v[i];
2238 }
2239 return r;
2240 }
2241
2242 [[nodiscard]] friend constexpr numeric_array operator*(numeric_array const &lhs, T const &rhs) noexcept
2243 {
2244 return lhs * broadcast(rhs);
2245 }
2246
2247 [[nodiscard]] friend constexpr numeric_array operator*(T const &lhs, numeric_array const &rhs) noexcept
2248 {
2249 return broadcast(lhs) * rhs;
2250 }
2251
2252 [[nodiscard]] friend constexpr numeric_array operator/(numeric_array const &lhs, numeric_array const &rhs) noexcept
2253 {
2254 if (not std::is_constant_evaluated()) {
2255#if defined(HI_HAS_AVX)
2256 if constexpr (is_f64x4) {
2257 return numeric_array{_mm256_div_pd(lhs.reg(), rhs.reg())};
2258 } else if constexpr (is_f32x8) {
2259 return numeric_array{_mm256_div_ps(lhs.reg(), rhs.reg())};
2260 }
2261#endif
2262#if defined(HI_HAS_SSE2)
2263 if constexpr (is_f64x2) {
2264 return numeric_array{_mm_div_pd(lhs.reg(), rhs.reg())};
2265 }
2266#endif
2267#if defined(HI_HAS_SSE)
2268 if constexpr (is_f32x4) {
2269 return numeric_array{_mm_div_ps(lhs.reg(), rhs.reg())};
2270 }
2271#endif
2272 }
2273
2274 auto r = numeric_array{};
2275 for (std::size_t i = 0; i != N; ++i) {
2276 r.v[i] = lhs.v[i] / rhs.v[i];
2277 }
2278 return r;
2279 }
2280
2281 [[nodiscard]] friend constexpr numeric_array operator/(numeric_array const &lhs, T const &rhs) noexcept
2282 {
2283 return lhs / broadcast(rhs);
2284 }
2285
2286 [[nodiscard]] friend constexpr numeric_array operator/(T const &lhs, numeric_array const &rhs) noexcept
2287 {
2288 return broadcast(lhs) / rhs;
2289 }
2290
2291 [[nodiscard]] friend constexpr numeric_array operator%(numeric_array const &lhs, numeric_array const &rhs) noexcept
2292 {
2293 hilet div_result = floor(lhs / rhs);
2294 return lhs - (div_result * rhs);
2295 }
2296
2297 [[nodiscard]] friend constexpr numeric_array operator%(numeric_array const &lhs, T const &rhs) noexcept
2298 {
2299 return lhs % broadcast(rhs);
2300 }
2301
2302 [[nodiscard]] friend constexpr numeric_array operator%(T const &lhs, numeric_array const &rhs) noexcept
2303 {
2304 return broadcast(lhs) % rhs;
2305 }
2306
2307 [[nodiscard]] friend constexpr numeric_array min(numeric_array const &lhs, numeric_array const &rhs) noexcept
2308 {
2309 if (not std::is_constant_evaluated()) {
2310#if defined(HI_HAS_AVX2)
2311 if constexpr (is_i32x8) {
2312 return numeric_array{_mm256_min_epi32(lhs.reg(), rhs.reg())};
2313 } else if constexpr (is_u32x8) {
2314 return numeric_array{_mm256_min_epu32(lhs.reg(), rhs.reg())};
2315 } else if constexpr (is_i16x16) {
2316 return numeric_array{_mm256_min_epi16(lhs.reg(), rhs.reg())};
2317 } else if constexpr (is_u16x16) {
2318 return numeric_array{_mm256_min_epu16(lhs.reg(), rhs.reg())};
2319 } else if constexpr (is_i8x32) {
2320 return numeric_array{_mm256_min_epi8(lhs.reg(), rhs.reg())};
2321 } else if constexpr (is_u8x32) {
2322 return numeric_array{_mm256_min_epu8(lhs.reg(), rhs.reg())};
2323 }
2324#endif
2325#if defined(HI_HAS_AVX)
2326 if constexpr (is_f64x4) {
2327 return numeric_array{_mm256_min_pd(lhs.reg(), rhs.reg())};
2328 } else if constexpr (is_f32x8) {
2329 return numeric_array{_mm256_min_ps(lhs.reg(), rhs.reg())};
2330 }
2331#endif
2332#if defined(HI_HAS_SSE4_1)
2333 if constexpr (is_i32x4) {
2334 return numeric_array{_mm_min_epi32(lhs.reg(), rhs.reg())};
2335 } else if constexpr (is_u32x4) {
2336 return numeric_array{_mm_min_epu32(lhs.reg(), rhs.reg())};
2337 } else if constexpr (is_u16x8) {
2338 return numeric_array{_mm_min_epu16(lhs.reg(), rhs.reg())};
2339 } else if constexpr (is_i8x16) {
2340 return numeric_array{_mm_min_epi8(lhs.reg(), rhs.reg())};
2341 }
2342#endif
2343#if defined(HI_HAS_SSE2)
2344 if constexpr (is_f64x2) {
2345 return numeric_array{_mm_min_pd(lhs.reg(), rhs.reg())};
2346 } else if constexpr (is_i16x8) {
2347 return numeric_array{_mm_min_epi16(lhs.reg(), rhs.reg())};
2348 } else if constexpr (is_u8x16) {
2349 return numeric_array{_mm_min_epu8(lhs.reg(), rhs.reg())};
2350 }
2351#endif
2352#if defined(HI_HAS_SSE)
2353 if constexpr (is_f32x4) {
2354 return numeric_array{_mm_min_ps(lhs.reg(), rhs.reg())};
2355 }
2356#endif
2357 }
2358
2359 auto r = numeric_array{};
2360 for (std::size_t i = 0; i != N; ++i) {
2361 r.v[i] = std::min(lhs.v[i], rhs.v[i]);
2362 }
2363 return r;
2364 }
2365
2366 [[nodiscard]] friend constexpr numeric_array max(numeric_array const &lhs, numeric_array const &rhs) noexcept
2367 {
2368 if (not std::is_constant_evaluated()) {
2369#if defined(HI_HAS_AVX2)
2370 if constexpr (is_i32x8) {
2371 return numeric_array{_mm256_max_epi32(lhs.reg(), rhs.reg())};
2372 } else if constexpr (is_u32x8) {
2373 return numeric_array{_mm256_max_epu32(lhs.reg(), rhs.reg())};
2374 } else if constexpr (is_i16x16) {
2375 return numeric_array{_mm256_max_epi16(lhs.reg(), rhs.reg())};
2376 } else if constexpr (is_u16x16) {
2377 return numeric_array{_mm256_max_epu16(lhs.reg(), rhs.reg())};
2378 } else if constexpr (is_i8x32) {
2379 return numeric_array{_mm256_max_epi8(lhs.reg(), rhs.reg())};
2380 } else if constexpr (is_u8x32) {
2381 return numeric_array{_mm256_max_epu8(lhs.reg(), rhs.reg())};
2382 }
2383#endif
2384#if defined(HI_HAS_AVX)
2385 if constexpr (is_f64x4) {
2386 return numeric_array{_mm256_max_pd(lhs.reg(), rhs.reg())};
2387 } else if constexpr (is_f32x8) {
2388 return numeric_array{_mm256_max_ps(lhs.reg(), rhs.reg())};
2389 }
2390#endif
2391#if defined(HI_HAS_SSE4_1)
2392 if constexpr (is_i32x4) {
2393 return numeric_array{_mm_max_epi32(lhs.reg(), rhs.reg())};
2394 } else if constexpr (is_u32x4) {
2395 return numeric_array{_mm_max_epu32(lhs.reg(), rhs.reg())};
2396 } else if constexpr (is_u16x8) {
2397 return numeric_array{_mm_max_epu16(lhs.reg(), rhs.reg())};
2398 } else if constexpr (is_i8x16) {
2399 return numeric_array{_mm_max_epi8(lhs.reg(), rhs.reg())};
2400 }
2401#endif
2402#if defined(HI_HAS_SSE2)
2403 if constexpr (is_f64x2) {
2404 return numeric_array{_mm_max_pd(lhs.reg(), rhs.reg())};
2405 } else if constexpr (is_i16x8) {
2406 return numeric_array{_mm_max_epi16(lhs.reg(), rhs.reg())};
2407 } else if constexpr (is_u8x16) {
2408 return numeric_array{_mm_max_epu8(lhs.reg(), rhs.reg())};
2409 }
2410#endif
2411#if defined(HI_HAS_SSE)
2412 if constexpr (is_f32x4) {
2413 return numeric_array{_mm_max_ps(lhs.reg(), rhs.reg())};
2414 }
2415#endif
2416 }
2417
2418 auto r = numeric_array{};
2419 for (std::size_t i = 0; i != N; ++i) {
2420 r.v[i] = std::max(lhs.v[i], rhs.v[i]);
2421 }
2422 return r;
2423 }
2424
2425 [[nodiscard]] friend constexpr numeric_array
2426 clamp(numeric_array const &lhs, numeric_array const &low, numeric_array const &high) noexcept
2427 {
2428 return min(max(lhs, low), high);
2429 }
2430
2431 [[nodiscard]] friend constexpr numeric_array hadd(numeric_array const &lhs, numeric_array const &rhs) noexcept
2432 {
2433 if (not std::is_constant_evaluated()) {
2434#if defined(HI_HAS_AVX2)
2435 if constexpr (is_i32x8 or is_u32x8) {
2436 return numeric_array{_mm256_hadd_epi32(lhs.reg(), rhs.reg())};
2437 } else if constexpr (is_i16x16 or is_u16x16) {
2438 return numeric_array{_mm256_hadd_epi16(lhs.reg(), rhs.reg())};
2439 }
2440#endif
2441#if defined(HI_HAS_AVX)
2442 if constexpr (is_f64x4) {
2443 return numeric_array{_mm256_hadd_pd(lhs.reg(), rhs.reg())};
2444 } else if constexpr (is_f32x8) {
2445 return numeric_array{_mm256_hadd_ps(lhs.reg(), rhs.reg())};
2446 }
2447#endif
2448#if defined(HI_HAS_SSSE3)
2449 if constexpr (is_i32x4 or is_u32x4) {
2450 return numeric_array{_mm_hadd_epi32(lhs.reg(), rhs.reg())};
2451 } else if constexpr (is_i16x8 or is_u16x8) {
2452 return numeric_array{_mm_hadd_epi16(lhs.reg(), rhs.reg())};
2453 }
2454#endif
2455#if defined(HI_HAS_SSE3)
2456 if constexpr (is_f64x2) {
2457 return numeric_array{_mm_hadd_pd(lhs.reg(), rhs.reg())};
2458 } else if constexpr (is_f32x4) {
2459 return numeric_array{_mm_hadd_ps(lhs.reg(), rhs.reg())};
2460 }
2461#endif
2462 }
2463
2464 hi_axiom(N % 2 == 0);
2465
2466 auto r = numeric_array{};
2467
2468 std::size_t src_i = 0;
2469 std::size_t dst_i = 0;
2470 while (src_i != N) {
2471 auto tmp = lhs[src_i++];
2472 tmp += lhs[src_i++];
2473 r.v[dst_i++] = tmp;
2474 }
2475
2476 src_i = 0;
2477 while (src_i != N) {
2478 auto tmp = rhs[src_i++];
2479 tmp += rhs[src_i++];
2480 r.v[dst_i++] = tmp;
2481 }
2482 return r;
2483 }
2484
2485 [[nodiscard]] friend constexpr numeric_array hsub(numeric_array const &lhs, numeric_array const &rhs) noexcept
2486 {
2487 if (not std::is_constant_evaluated()) {
2488#if defined(HI_HAS_AVX2)
2489 if constexpr (is_i32x8 or is_u32x8) {
2490 return numeric_array{_mm256_hsub_epi32(lhs.reg(), rhs.reg())};
2491 } else if constexpr (is_i16x16 or is_u16x16) {
2492 return numeric_array{_mm256_hsub_epi16(lhs.reg(), rhs.reg())};
2493 }
2494#endif
2495#if defined(HI_HAS_AVX)
2496 if constexpr (is_f64x4) {
2497 return numeric_array{_mm256_hsub_pd(lhs.reg(), rhs.reg())};
2498 } else if constexpr (is_f32x8) {
2499 return numeric_array{_mm256_hsub_ps(lhs.reg(), rhs.reg())};
2500 }
2501#endif
2502#if defined(HI_HAS_SSSE3)
2503 if constexpr (is_i32x4 or is_u32x4) {
2504 return numeric_array{_mm_hsub_epi32(lhs.reg(), rhs.reg())};
2505 } else if constexpr (is_i16x8 or is_u16x8) {
2506 return numeric_array{_mm_hsub_epi16(lhs.reg(), rhs.reg())};
2507 }
2508#endif
2509#if defined(HI_HAS_SSE3)
2510 if constexpr (is_f64x2) {
2511 return numeric_array{_mm_hsub_pd(lhs.reg(), rhs.reg())};
2512 } else if constexpr (is_f32x4) {
2513 return numeric_array{_mm_hsub_ps(lhs.reg(), rhs.reg())};
2514 }
2515#endif
2516 }
2517
2518 hi_axiom(N % 2 == 0);
2519
2520 auto r = numeric_array{};
2521
2522 std::size_t src_i = 0;
2523 std::size_t dst_i = 0;
2524 while (src_i != N) {
2525 auto tmp = lhs[src_i++];
2526 tmp -= lhs[src_i++];
2527 r.v[dst_i++] = tmp;
2528 }
2529
2530 src_i = 0;
2531 while (src_i != N) {
2532 auto tmp = rhs[src_i++];
2533 tmp -= rhs[src_i++];
2534 r.v[dst_i++] = tmp;
2535 }
2536 return r;
2537 }
2538
2543 template<std::size_t Mask>
2544 [[nodiscard]] friend constexpr numeric_array addsub(numeric_array const &lhs, numeric_array const &rhs) noexcept
2545 {
2546 constexpr std::size_t not_mask = (1 << N) - 1;
2547 return lhs + neg<Mask ^ not_mask>(rhs);
2548 }
2549
2552 [[nodiscard]] friend constexpr numeric_array cross_2D(numeric_array const &rhs) noexcept requires(N >= 2)
2553 {
2554 hi_axiom(rhs.z() == 0.0f && rhs.is_vector());
2555 return numeric_array{-rhs.y(), rhs.x()};
2556 }
2557
2560 [[nodiscard]] friend constexpr numeric_array normal_2D(numeric_array const &rhs) noexcept requires(N >= 2)
2561 {
2562 return normalize<0b0011>(cross_2D(rhs));
2563 }
2564
2568 [[nodiscard]] friend constexpr float cross_2D(numeric_array const &lhs, numeric_array const &rhs) noexcept requires(N >= 2)
2569 {
2570 hilet tmp1 = rhs.yxwz();
2571 hilet tmp2 = lhs * tmp1;
2572 hilet tmp3 = hsub(tmp2, tmp2);
2573 return get<0>(tmp3);
2574 }
2575
2576 // x=a.y*b.z - a.z*b.y
2577 // y=a.z*b.x - a.x*b.z
2578 // z=a.x*b.y - a.y*b.x
2579 // w=a.w*b.w - a.w*b.w
2580 [[nodiscard]] constexpr friend numeric_array cross_3D(numeric_array const &lhs, numeric_array const &rhs) noexcept
2581 requires(N == 4)
2582 {
2583 hilet a_left = lhs.yzxw();
2584 hilet b_left = rhs.zxyw();
2585 hilet left = a_left * b_left;
2586
2587 hilet a_right = lhs.zxyw();
2588 hilet b_right = rhs.yzxw();
2589 hilet right = a_right * b_right;
2590 return left - right;
2591 }
2592
2593 [[nodiscard]] static constexpr numeric_array byte_srl_shuffle_indices(unsigned int rhs) requires(is_i8x16)
2594 {
2595 static_assert(std::endian::native == std::endian::little);
2596
2597 auto r = numeric_array{};
2598 for (auto i = 0; i != 16; ++i) {
2599 if ((i + rhs) < 16) {
2600 r[i] = narrow_cast<int8_t>(i + rhs);
2601 } else {
2602 // Indices set to -1 result in a zero after a byte shuffle.
2603 r[i] = -1;
2604 }
2605 }
2606 return r;
2607 }
2608
2609 [[nodiscard]] static constexpr numeric_array byte_sll_shuffle_indices(unsigned int rhs) requires(is_i8x16)
2610 {
2611 static_assert(std::endian::native == std::endian::little);
2612
2613 auto r = numeric_array{};
2614 for (auto i = 0; i != 16; ++i) {
2615 if ((i - rhs) >= 0) {
2616 r[i] = narrow_cast<int8_t>(i - rhs);
2617 } else {
2618 // Indices set to -1 result in a zero after a byte shuffle.
2619 r[i] = -1;
2620 }
2621 }
2622 return r;
2623 }
2624
2627 [[nodiscard]] friend constexpr numeric_array shuffle(numeric_array const &lhs, numeric_array const &rhs) noexcept
2628 requires(std::is_integral_v<value_type>)
2629 {
2630 if (!std::is_constant_evaluated()) {
2631#if defined(HI_HAS_SSSE3)
2632 if constexpr (is_i8x16 or is_u8x16) {
2633 return numeric_array{_mm_shuffle_epi8(lhs.reg(), rhs.reg())};
2634 }
2635#endif
2636 }
2637
2638 auto r = numeric_array{};
2639 for (std::size_t i = 0; i != N; ++i) {
2640 if (rhs[i] >= 0) {
2641 r[i] = lhs[rhs[i] & 0xf];
2642 } else {
2643 r[i] = 0;
2644 }
2645 }
2646
2647 return r;
2648 }
2649
2652 [[nodiscard]] friend constexpr numeric_array midpoint(numeric_array const &p1, numeric_array const &p2) noexcept
2653 {
2654 hi_axiom(p1.is_point());
2655 hi_axiom(p2.is_point());
2656 return (p1 + p2) * 0.5f;
2657 }
2658
2661 [[nodiscard]] friend constexpr numeric_array reflect_point(numeric_array const &p, numeric_array const anchor) noexcept
2662 {
2663 hi_axiom(p.is_point());
2664 hi_axiom(anchor.is_point());
2665 return anchor - (p - anchor);
2666 }
2667
2668 template<typename... Columns>
2669 [[nodiscard]] friend constexpr std::array<numeric_array, N> transpose(Columns const &...columns) noexcept
2670 {
2671 static_assert(sizeof...(Columns) == N, "Can only transpose square matrices");
2672
2673 if (not std::is_constant_evaluated()) {
2674#if defined(HI_HAS_SSE)
2675 if constexpr (is_f32x4 and sizeof...(Columns) == 4) {
2676 auto tmp = std::array<__m128, N>{columns.reg()...};
2677 _MM_TRANSPOSE4_PS(std::get<0>(tmp), std::get<1>(tmp), std::get<2>(tmp), std::get<3>(tmp));
2678 return {
2679 numeric_array{get<0>(tmp)},
2680 numeric_array{get<1>(tmp)},
2681 numeric_array{get<2>(tmp)},
2682 numeric_array{get<3>(tmp)}};
2683#endif
2684 }
2685 }
2686
2688 transpose_detail<0, Columns...>(columns..., r);
2689 return r;
2690 }
2691
2692 [[nodiscard]] constexpr friend numeric_array composit(numeric_array const &under, numeric_array const &over) noexcept
2693 requires(N == 4 && std::is_floating_point_v<T>)
2694 {
2695 if (over.is_transparent()) {
2696 return under;
2697 }
2698 if (over.is_opaque()) {
2699 return over;
2700 }
2701
2702 hilet over_alpha = over.wwww();
2703 hilet under_alpha = under.wwww();
2704
2705 hilet over_color = over.xyz1();
2706 hilet under_color = under.xyz1();
2707
2708 hilet output_color = over_color * over_alpha + under_color * under_alpha * (T{1} - over_alpha);
2709
2710 return output_color / output_color.www1();
2711 }
2712
2713 [[nodiscard]] constexpr friend numeric_array composit(numeric_array const &under, numeric_array const &over) noexcept
2714 requires(is_f16x4)
2715 {
2716 return numeric_array{composit(static_cast<numeric_array<float, 4>>(under), static_cast<numeric_array<float, 4>>(over))};
2717 }
2718
/** Format an array as text.
 *
 * The elements are formatted with `std::format("{}", ...)`, separated by
 * "; " and wrapped in parentheses.
 *
 * @param rhs The array to format.
 * @return The formatted text.
 */
[[nodiscard]] friend std::string to_string(numeric_array const &rhs) noexcept
{
    auto text = std::string{};

    text += '(';
    for (std::size_t idx = 0; idx != N; ++idx) {
        // A separator goes before every element except the first.
        if (idx != 0) {
            text += "; ";
        }
        text += std::format("{}", rhs[idx]);
    }
    text += ')';

    return text;
}
2733
2734 friend std::ostream &operator<<(std::ostream &lhs, numeric_array const &rhs)
2735 {
2736 return lhs << to_string(rhs);
2737 }
2738
2743 template<std::size_t FromElement, std::size_t ToElement>
2744 [[nodiscard]] constexpr friend numeric_array insert(numeric_array const &lhs, numeric_array const &rhs)
2745 {
2746 auto r = numeric_array{};
2747
2748 if (!std::is_constant_evaluated()) {
2749#if defined(HI_HAS_SSE4_1)
2750 if constexpr (is_f32x4) {
2751 constexpr uint8_t insert_mask = static_cast<uint8_t>((FromElement << 6) | (ToElement << 4));
2752 return numeric_array{_mm_insert_ps(lhs.reg(), rhs.reg(), insert_mask)};
2753
2754 } else if constexpr (is_i32x4 or is_u32x4) {
2755 constexpr uint8_t insert_mask = static_cast<uint8_t>((FromElement << 6) | (ToElement << 4));
2756 return numeric_array{
2757 _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(lhs.reg()), _mm_castsi128_ps(rhs.reg()), insert_mask))};
2758 }
2759#endif
2760#if defined(HI_HAS_SSE2)
2761 if constexpr (is_f64x2) {
2762 if constexpr (FromElement == 0 and ToElement == 0) {
2763 return numeric_array{_mm_shuffle_pd(rhs.reg(), lhs.reg(), 0b10)};
2764 } else if constexpr (FromElement == 1 and ToElement == 0) {
2765 return numeric_array{_mm_shuffle_pd(rhs.reg(), lhs.reg(), 0b11)};
2766 } else if constexpr (FromElement == 0 and ToElement == 1) {
2767 return numeric_array{_mm_shuffle_pd(lhs.reg(), rhs.reg(), 0b00)};
2768 } else {
2769 return numeric_array{_mm_shuffle_pd(lhs.reg(), rhs.reg(), 0b10)};
2770 }
2771
2772 } else if constexpr (is_i64x2 or is_u64x2) {
2773 hilet lhs_ = _mm_castsi128_pd(lhs.reg());
2774 hilet rhs_ = _mm_castsi128_pd(rhs.reg());
2775
2776 if constexpr (FromElement == 0 and ToElement == 0) {
2777 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(rhs_, lhs_, 0b10))};
2778 } else if constexpr (FromElement == 1 and ToElement == 0) {
2779 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(rhs_, lhs_, 0b11))};
2780 } else if constexpr (FromElement == 0 and ToElement == 1) {
2781 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(lhs_, rhs_, 0b00))};
2782 } else {
2783 return numeric_array{_mm_castpd_si128(_mm_shuffle_pd(lhs_, rhs_, 0b10))};
2784 }
2785 }
2786#endif
2787 }
2788
2789 for (std::size_t i = 0; i != N; ++i) {
2790 r[i] = (i == ToElement) ? rhs[FromElement] : lhs[i];
2791 }
2792
2793 return r;
2794 }
2795
2803 template<ssize_t... Elements>
2804 [[nodiscard]] constexpr numeric_array swizzle() const
2805 {
2806 static_assert(sizeof...(Elements) <= N);
2807
2808 if (!std::is_constant_evaluated()) {
2809#if defined(HI_HAS_AVX)
2810 if constexpr (is_f64x2) {
2811 return numeric_array{_mm_swizzle_pd<Elements...>(reg())};
2812 } else if constexpr (is_f32x4) {
2813 return numeric_array{_mm_swizzle_ps<Elements...>(reg())};
2814 } else if constexpr (is_i64x2 or is_u64x2) {
2815 return numeric_array{_mm_swizzle_epi64<Elements...>(reg())};
2816 } else if constexpr (is_i32x4 or is_u32x4) {
2817 return numeric_array{_mm_swizzle_epi32<Elements...>(reg())};
2818 }
2819#endif
2820 }
2821
2822 auto r = numeric_array{};
2823 swizzle_detail<0, Elements...>(r);
2824 return r;
2825 }
2826
2827#define SWIZZLE(swizzle_name, D, ...) \
2828 [[nodiscard]] constexpr numeric_array swizzle_name() const noexcept requires(D == N) \
2829 { \
2830 return swizzle<__VA_ARGS__>(); \
2831 }
2832
2833#define SWIZZLE_4D_GEN1(name, ...) \
2834 SWIZZLE(name##0, 4, __VA_ARGS__, get_zero) \
2835 SWIZZLE(name##1, 4, __VA_ARGS__, get_one) \
2836 SWIZZLE(name##x, 4, __VA_ARGS__, 0) \
2837 SWIZZLE(name##y, 4, __VA_ARGS__, 1) \
2838 SWIZZLE(name##z, 4, __VA_ARGS__, 2) \
2839 SWIZZLE(name##w, 4, __VA_ARGS__, 3)
2840
2841#define SWIZZLE_4D_GEN2(name, ...) \
2842 SWIZZLE_4D_GEN1(name##0, __VA_ARGS__, get_zero) \
2843 SWIZZLE_4D_GEN1(name##1, __VA_ARGS__, get_one) \
2844 SWIZZLE_4D_GEN1(name##x, __VA_ARGS__, 0) \
2845 SWIZZLE_4D_GEN1(name##y, __VA_ARGS__, 1) \
2846 SWIZZLE_4D_GEN1(name##z, __VA_ARGS__, 2) \
2847 SWIZZLE_4D_GEN1(name##w, __VA_ARGS__, 3)
2848
2849#define SWIZZLE_4D_GEN3(name, ...) \
2850 SWIZZLE_4D_GEN2(name##0, __VA_ARGS__, get_zero) \
2851 SWIZZLE_4D_GEN2(name##1, __VA_ARGS__, get_one) \
2852 SWIZZLE_4D_GEN2(name##x, __VA_ARGS__, 0) \
2853 SWIZZLE_4D_GEN2(name##y, __VA_ARGS__, 1) \
2854 SWIZZLE_4D_GEN2(name##z, __VA_ARGS__, 2) \
2855 SWIZZLE_4D_GEN2(name##w, __VA_ARGS__, 3)
2856
2857 SWIZZLE_4D_GEN3(_0, get_zero)
2858 SWIZZLE_4D_GEN3(_1, get_one)
2859 SWIZZLE_4D_GEN3(x, 0)
2860 SWIZZLE_4D_GEN3(y, 1)
2861 SWIZZLE_4D_GEN3(z, 2)
2862 SWIZZLE_4D_GEN3(w, 3)
2863
2864#define SWIZZLE_3D_GEN1(name, ...) \
2865 SWIZZLE(name##0, 3, __VA_ARGS__, get_zero) \
2866 SWIZZLE(name##1, 3, __VA_ARGS__, get_one) \
2867 SWIZZLE(name##x, 3, __VA_ARGS__, 0) \
2868 SWIZZLE(name##y, 3, __VA_ARGS__, 1) \
2869 SWIZZLE(name##z, 3, __VA_ARGS__, 2)
2870
2871#define SWIZZLE_3D_GEN2(name, ...) \
2872 SWIZZLE_3D_GEN1(name##0, __VA_ARGS__, get_zero) \
2873 SWIZZLE_3D_GEN1(name##1, __VA_ARGS__, get_one) \
2874 SWIZZLE_3D_GEN1(name##x, __VA_ARGS__, 0) \
2875 SWIZZLE_3D_GEN1(name##y, __VA_ARGS__, 1) \
2876 SWIZZLE_3D_GEN1(name##z, __VA_ARGS__, 2)
2877
2878 SWIZZLE_3D_GEN2(_0, get_zero)
2879 SWIZZLE_3D_GEN2(_1, get_one)
2880 SWIZZLE_3D_GEN2(x, 0)
2881 SWIZZLE_3D_GEN2(y, 1)
2882 SWIZZLE_3D_GEN2(z, 2)
2883
2884#define SWIZZLE_2D_GEN1(name, ...) \
2885 SWIZZLE(name##0, 2, __VA_ARGS__, get_zero) \
2886 SWIZZLE(name##1, 2, __VA_ARGS__, get_one) \
2887 SWIZZLE(name##x, 2, __VA_ARGS__, 0) \
2888 SWIZZLE(name##y, 2, __VA_ARGS__, 1)
2889
2890 SWIZZLE_2D_GEN1(_0, get_zero)
2891 SWIZZLE_2D_GEN1(_1, get_one)
2892 SWIZZLE_2D_GEN1(x, 0)
2893 SWIZZLE_2D_GEN1(y, 1)
2894
2895#undef SWIZZLE
2896#undef SWIZZLE_4D_GEN1
2897#undef SWIZZLE_4D_GEN2
2898#undef SWIZZLE_4D_GEN3
2899#undef SWIZZLE_3D_GEN1
2900#undef SWIZZLE_3D_GEN2
2901#undef SWIZZLE_2D_GEN1
2902
2903 template<int I, typename First, typename... Rest>
2904 friend constexpr void transpose_detail(First const &first, Rest const &...rest, std::array<numeric_array, N> &r) noexcept
2905 {
2906 for (std::size_t j = 0; j != N; ++j) {
2907 r[j][I] = first[j];
2908 }
2909
2910 if constexpr (sizeof...(Rest) != 0) {
2911 transpose_detail<I + 1, Rest...>(rest..., r);
2912 }
2913 }
2914
2915 template<ssize_t I, ssize_t FirstElement, ssize_t... RestElements>
2916 constexpr void swizzle_detail(numeric_array &r) const noexcept
2917 {
2918 static_assert(I < narrow_cast<ssize_t>(N));
2919 static_assert(FirstElement >= -2 && FirstElement < narrow_cast<ssize_t>(N), "Index out of bounds");
2920
2921 get<I>(r) = get<FirstElement>(*this);
2922 if constexpr (sizeof...(RestElements) != 0) {
2923 swizzle_detail<I + 1, RestElements...>(r);
2924 }
2925 }
2926};
2927
2928using i8x1 = numeric_array<int8_t, 1>;
2929using i8x2 = numeric_array<int8_t, 2>;
2930using i8x4 = numeric_array<int8_t, 4>;
2931using i8x8 = numeric_array<int8_t, 8>;
2932using i8x16 = numeric_array<int8_t, 16>;
2933using i8x32 = numeric_array<int8_t, 32>;
2934using i8x64 = numeric_array<int8_t, 64>;
2935
2936using u8x1 = numeric_array<uint8_t, 1>;
2937using u8x2 = numeric_array<uint8_t, 2>;
2938using u8x4 = numeric_array<uint8_t, 4>;
2939using u8x8 = numeric_array<uint8_t, 8>;
2940using u8x16 = numeric_array<uint8_t, 16>;
2941using u8x32 = numeric_array<uint8_t, 32>;
2942using u8x64 = numeric_array<uint8_t, 64>;
2943
2944using i16x1 = numeric_array<int16_t, 1>;
2945using i16x2 = numeric_array<int16_t, 2>;
2946using i16x4 = numeric_array<int16_t, 4>;
2947using i16x8 = numeric_array<int16_t, 8>;
2948using i16x16 = numeric_array<int16_t, 16>;
2949using i16x32 = numeric_array<int16_t, 32>;
2950
2951using u16x1 = numeric_array<uint16_t, 1>;
2952using u16x2 = numeric_array<uint16_t, 2>;
2953using u16x4 = numeric_array<uint16_t, 4>;
2954using u16x8 = numeric_array<uint16_t, 8>;
2955using u16x16 = numeric_array<uint16_t, 16>;
2956using u16x32 = numeric_array<uint16_t, 32>;
2957
2958using f16x4 = numeric_array<float16, 4>;
2959
2960using i32x1 = numeric_array<int32_t, 1>;
2961using i32x2 = numeric_array<int32_t, 2>;
2962using i32x4 = numeric_array<int32_t, 4>;
2963using i32x8 = numeric_array<int32_t, 8>;
2964using i32x16 = numeric_array<int32_t, 16>;
2965
2966using u32x1 = numeric_array<uint32_t, 1>;
2967using u32x2 = numeric_array<uint32_t, 2>;
2968using u32x4 = numeric_array<uint32_t, 4>;
2969using u32x8 = numeric_array<uint32_t, 8>;
2970using u32x16 = numeric_array<uint32_t, 16>;
2971
2972using f32x1 = numeric_array<float, 1>;
2973using f32x2 = numeric_array<float, 2>;
2974using f32x4 = numeric_array<float, 4>;
2975using f32x8 = numeric_array<float, 8>;
2976using f32x16 = numeric_array<float, 16>;
2977
2978using i64x1 = numeric_array<int64_t, 1>;
2979using i64x2 = numeric_array<int64_t, 2>;
2980using i64x4 = numeric_array<int64_t, 4>;
2981using i64x8 = numeric_array<int64_t, 8>;
2982
2983using u64x1 = numeric_array<uint64_t, 1>;
2984using u64x2 = numeric_array<uint64_t, 2>;
2985using u64x4 = numeric_array<uint64_t, 4>;
2986using u64x8 = numeric_array<uint64_t, 8>;
2987
2988using f64x1 = numeric_array<double, 1>;
2989using f64x2 = numeric_array<double, 2>;
2990using f64x4 = numeric_array<double, 4>;
2991using f64x8 = numeric_array<double, 8>;
2992
2993} // namespace hi::inline v1
2994
2995template<class T, std::size_t N>
2996struct std::tuple_size<hi::numeric_array<T, N>> : std::integral_constant<std::size_t, N> {
2997};
2998
2999template<std::size_t I, class T, std::size_t N>
3000struct std::tuple_element<I, hi::numeric_array<T, N>> {
3001 using type = T;
3002};
3003
3004hi_warning_pop();
std::ptrdiff_t ssize_t
Signed size/index into an array.
Definition required.hpp:37
#define hilet
Invariant should be the default for variables.
Definition required.hpp:23
Functions and macros for handling architectural difference between compilers, CPUs and operating syst...
constexpr alignment operator|(horizontal_alignment lhs, vertical_alignment rhs) noexcept
Combine vertical and horizontal alignment.
Definition alignment.hpp:200
STL namespace.
Definition numeric_array.hpp:54
friend constexpr T get(numeric_array &&rhs) noexcept
Get an element from the numeric array.
Definition numeric_array.hpp:1032
hi_force_inline friend constexpr T squared_hypot(numeric_array const &rhs) noexcept
Take the squared length of the vector.
Definition numeric_array.hpp:1525
constexpr void store(std::byte *ptr) const noexcept
Store a numeric array into memory.
Definition numeric_array.hpp:684
hi_force_inline friend constexpr T dot(numeric_array const &lhs, numeric_array const &rhs) noexcept
Take a dot product.
Definition numeric_array.hpp:1483
friend constexpr numeric_array neg(numeric_array rhs) noexcept
Negate individual elements.
Definition numeric_array.hpp:1286
friend constexpr T get(numeric_array const &rhs) noexcept
Get an element from the numeric array.
Definition numeric_array.hpp:1142
friend constexpr numeric_array cross_2D(numeric_array const &rhs) noexcept
Calculate the 2D normal on a 2D vector.
Definition numeric_array.hpp:2552
friend constexpr numeric_array reflect_point(numeric_array const &p, numeric_array const anchor) noexcept
Find the point on the other side and at the same distance of an anchor-point.
Definition numeric_array.hpp:2661
friend constexpr numeric_array midpoint(numeric_array const &p1, numeric_array const &p2) noexcept
Find a point at the midpoint between two points.
Definition numeric_array.hpp:2652
static constexpr numeric_array load(std::byte const *ptr) noexcept
Load a numeric array from memory.
Definition numeric_array.hpp:657
friend constexpr T & get(numeric_array &rhs) noexcept
Get an element from the numeric array.
Definition numeric_array.hpp:1020
friend constexpr numeric_array shuffle(numeric_array const &lhs, numeric_array const &rhs) noexcept
Shuffle a 16x byte array, using the indices from the right-hand-side.
Definition numeric_array.hpp:2627
friend constexpr numeric_array blend(numeric_array const &a, numeric_array const &b, numeric_array const &mask)
Blend the values using a dynamic mask.
Definition numeric_array.hpp:1236
constexpr friend T extract(numeric_array const &rhs) noexcept
Extract an element from the array.
Definition numeric_array.hpp:1052
constexpr friend numeric_array insert(numeric_array const &lhs, T rhs) noexcept
Insert a value in the array.
Definition numeric_array.hpp:1107
friend constexpr T rcp_hypot(numeric_array const &rhs) noexcept
Take a reciprocal of the length.
Definition numeric_array.hpp:1537
static constexpr numeric_array load(T const *ptr) noexcept
Load a numeric array from memory.
Definition numeric_array.hpp:668
friend constexpr T hypot(numeric_array const &rhs) noexcept
Take the length of the vector.
Definition numeric_array.hpp:1512
friend constexpr float cross_2D(numeric_array const &lhs, numeric_array const &rhs) noexcept
Calculate the cross-product between two 2D vectors.
Definition numeric_array.hpp:2568
friend constexpr numeric_array normal_2D(numeric_array const &rhs) noexcept
Calculate the 2D unit-normal on a 2D vector.
Definition numeric_array.hpp:2560
static constexpr numeric_array interleave_lo(numeric_array a, numeric_array b) noexcept
Interleave the first words in both arrays.
Definition numeric_array.hpp:611
friend constexpr numeric_array addsub(numeric_array const &lhs, numeric_array const &rhs) noexcept
Add or subtract individual elements.
Definition numeric_array.hpp:2544
static constexpr numeric_array load(std::byte const *ptr) noexcept
Load a numeric array from memory.
Definition numeric_array.hpp:646
friend constexpr numeric_array normalize(numeric_array const &rhs) noexcept
Normalize a vector.
Definition numeric_array.hpp:1559
constexpr friend numeric_array insert(numeric_array const &lhs, numeric_array const &rhs)
Insert an element from rhs into the result.
Definition numeric_array.hpp:2744
constexpr numeric_array swizzle() const
Swizzle around the elements of the numeric array.
Definition numeric_array.hpp:2804
friend constexpr numeric_array zero(numeric_array rhs) noexcept
Set individual elements to zero.
Definition numeric_array.hpp:1160
Definition concepts.hpp:35
Definition concepts.hpp:38
T back(T... args)
T begin(T... args)
T ceil(T... args)
T data(T... args)
T empty(T... args)
T end(T... args)
T floor(T... args)
T front(T... args)
T max(T... args)
T max_size(T... args)
T memcpy(T... args)
T min(T... args)
T round(T... args)
T size(T... args)
T sqrt(T... args)
T to_string(T... args)