HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
numeric_array.hpp
1// Copyright Take Vos 2020-2021.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../architecture.hpp"
8#include "../concepts.hpp"
9#include "../cast.hpp"
10#include "../type_traits.hpp"
11#include "raw_numeric_array.hpp"
12#if TT_X86_64_V2
13#include "f32x4_x64v2.hpp"
14#include "i8x16_x64v2.hpp"
15#endif
16
17#if TT_X86_64_V1
18#include <xmmintrin.h> // SSE
19#include <emmintrin.h> // SSE2
20#endif
21#if TT_X86_64_V2
22#include <pmmintrin.h> // SSE3
23#include <tmmintrin.h> // SSSE3
24#include <smmintrin.h> // SSE4.1
25#include <nmmintrin.h> // SSE4.2
26#include <ammintrin.h> // SSE4A
27#endif
28#if TT_X86_64_V2_5
29#include <immintrin.h> // AVX, AVX2, FMA
30#endif
31
32#include <cstdint>
33#include <ostream>
34#include <string>
35#include <array>
36#include <type_traits>
37#include <concepts>
38#include <bit>
39#include <climits>
40
41namespace tt {
42
43template<numeric_limited T, size_t N>
// Container-style member types, each forwarded from the matching member of `container_type`.
// NOTE(review): `container_type` is declared outside the visible excerpt; presumably std::array<T, N> — confirm.
46 using value_type = typename container_type::value_type;
47 using size_type = typename container_type::size_type;
48 using difference_type = typename container_type::difference_type;
49 using reference = typename container_type::reference;
50 using const_reference = typename container_type::const_reference;
51 using pointer = typename container_type::pointer;
52 using const_pointer = typename container_type::const_pointer;
53 using iterator = typename container_type::iterator;
54 using const_iterator = typename container_type::const_iterator;
55
56 constexpr static bool is_i8x1 = std::is_same_v<T, int8_t> && N == 1;
57 constexpr static bool is_i8x2 = std::is_same_v<T, int8_t> && N == 2;
58 constexpr static bool is_i8x4 = std::is_same_v<T, int8_t> && N == 4;
59 constexpr static bool is_i8x8 = std::is_same_v<T, int8_t> && N == 8;
60 constexpr static bool is_i8x16 = std::is_same_v<T, int8_t> && N == 16;
61 constexpr static bool is_i8x32 = std::is_same_v<T, int8_t> && N == 32;
62 constexpr static bool is_i8x64 = std::is_same_v<T, int8_t> && N == 64;
63 constexpr static bool is_u8x1 = std::is_same_v<T, uint8_t> && N == 1;
64 constexpr static bool is_u8x2 = std::is_same_v<T, uint8_t> && N == 2;
65 constexpr static bool is_u8x4 = std::is_same_v<T, uint8_t> && N == 4;
66 constexpr static bool is_u8x8 = std::is_same_v<T, uint8_t> && N == 8;
67 constexpr static bool is_u8x16 = std::is_same_v<T, uint8_t> && N == 16;
68 constexpr static bool is_u8x32 = std::is_same_v<T, uint8_t> && N == 32;
69 constexpr static bool is_u8x64 = std::is_same_v<T, uint8_t> && N == 64;
70
71 constexpr static bool is_i16x1 = std::is_same_v<T, int16_t> && N == 1;
72 constexpr static bool is_i16x2 = std::is_same_v<T, int16_t> && N == 2;
73 constexpr static bool is_i16x4 = std::is_same_v<T, int16_t> && N == 4;
74 constexpr static bool is_i16x8 = std::is_same_v<T, int16_t> && N == 8;
75 constexpr static bool is_i16x16 = std::is_same_v<T, int16_t> && N == 16;
76 constexpr static bool is_i16x32 = std::is_same_v<T, int16_t> && N == 32;
77 constexpr static bool is_u16x1 = std::is_same_v<T, uint16_t> && N == 1;
78 constexpr static bool is_u16x2 = std::is_same_v<T, uint16_t> && N == 2;
79 constexpr static bool is_u16x4 = std::is_same_v<T, uint16_t> && N == 4;
80 constexpr static bool is_u16x8 = std::is_same_v<T, uint16_t> && N == 8;
81 constexpr static bool is_u16x16 = std::is_same_v<T, uint16_t> && N == 16;
82 constexpr static bool is_u16x32 = std::is_same_v<T, uint16_t> && N == 32;
83
84 constexpr static bool is_i32x1 = std::is_same_v<T, int32_t> && N == 1;
85 constexpr static bool is_i32x2 = std::is_same_v<T, int32_t> && N == 2;
86 constexpr static bool is_i32x4 = std::is_same_v<T, int32_t> && N == 4;
87 constexpr static bool is_i32x8 = std::is_same_v<T, int32_t> && N == 8;
88 constexpr static bool is_i32x16 = std::is_same_v<T, int32_t> && N == 16;
89 constexpr static bool is_u32x1 = std::is_same_v<T, uint32_t> && N == 1;
90 constexpr static bool is_u32x2 = std::is_same_v<T, uint32_t> && N == 2;
91 constexpr static bool is_u32x4 = std::is_same_v<T, uint32_t> && N == 4;
92 constexpr static bool is_u32x8 = std::is_same_v<T, uint32_t> && N == 8;
93 constexpr static bool is_u32x16 = std::is_same_v<T, uint32_t> && N == 16;
94 constexpr static bool is_f32x1 = std::is_same_v<T, float> && N == 1;
95 constexpr static bool is_f32x2 = std::is_same_v<T, float> && N == 2;
96 constexpr static bool is_f32x4 = std::is_same_v<T, float> && N == 4;
97 constexpr static bool is_f32x8 = std::is_same_v<T, float> && N == 8;
98 constexpr static bool is_f32x16 = std::is_same_v<T, float> && N == 16;
99
100 constexpr static bool is_i64x1 = std::is_same_v<T, int64_t> && N == 1;
101 constexpr static bool is_i64x2 = std::is_same_v<T, int64_t> && N == 2;
102 constexpr static bool is_i64x4 = std::is_same_v<T, int64_t> && N == 4;
103 constexpr static bool is_i64x8 = std::is_same_v<T, int64_t> && N == 8;
104 constexpr static bool is_u64x1 = std::is_same_v<T, uint64_t> && N == 1;
105 constexpr static bool is_u64x2 = std::is_same_v<T, uint64_t> && N == 2;
106 constexpr static bool is_u64x4 = std::is_same_v<T, uint64_t> && N == 4;
107 constexpr static bool is_u64x8 = std::is_same_v<T, uint64_t> && N == 8;
108 constexpr static bool is_f64x1 = std::is_same_v<T, double> && N == 1;
109 constexpr static bool is_f64x2 = std::is_same_v<T, double> && N == 2;
110 constexpr static bool is_f64x4 = std::is_same_v<T, double> && N == 4;
111 constexpr static bool is_f64x8 = std::is_same_v<T, double> && N == 8;
112
114
115 constexpr numeric_array() noexcept = default;
116 constexpr numeric_array(numeric_array const &rhs) noexcept = default;
117 constexpr numeric_array(numeric_array &&rhs) noexcept = default;
118 constexpr numeric_array &operator=(numeric_array const &rhs) noexcept = default;
119 constexpr numeric_array &operator=(numeric_array &&rhs) noexcept = default;
120
121 template<arithmetic U, size_t M>
122 [[nodiscard]] constexpr explicit numeric_array(numeric_array<U, M> const &other) noexcept : v()
123 {
124 if (!std::is_constant_evaluated()) {
125 if constexpr (is_f64x2 and other.is_i32x4) {
126#if defined(TT_HAS_SSE2)
127 *this = numeric_array{_mm_cvtepi32_pd(other.reg())};
128 return;
129#endif
130 } else if constexpr (is_f32x4 and other.is_i32x4) {
131#if defined(TT_HAS_SSE2)
132 *this = numeric_array{_mm_cvtepi32_ps(other.reg())};
133 return;
134#endif
135 } else if constexpr (is_i32x4 and other.is_f32x4) {
136#if defined(TT_HAS_SSE2)
137 *this = numeric_array{_mm_cvtps_epi32(other.reg())};
138 return;
139#endif
140 } else if constexpr (is_i64x4 and other.is_i32x4) {
141#if defined(TT_HAS_SSE4_1)
142 *this = numeric_array{_mm_cvtepi32_epi64(other.reg())};
143 return;
144#endif
145 } else if constexpr (is_i64x4 and other.is_i16x8) {
146#if defined(TT_HAS_SSE4_1)
147 *this = numeric_array{_mm_cvtepi16_epi64(other.reg())};
148 return;
149#endif
150 } else if constexpr (is_i32x4 and other.is_i16x8) {
151#if defined(TT_HAS_SSE4_1)
152 *this = numeric_array{_mm_cvtepi16_epi32(other.reg())};
153 return;
154#endif
155 } else if constexpr (is_i64x2 and other.is_i8x16) {
156#if defined(TT_HAS_SSE4_1)
157 *this = numeric_array{_mm_cvtepi8_epi64(other.reg())};
158 return;
159#endif
160 } else if constexpr (is_i32x4 and other.is_i8x16) {
161#if defined(TT_HAS_SSE4_1)
162 *this = numeric_array{_mm_cvtepi8_epi32(other.reg())};
163 return;
164#endif
165 } else if constexpr (is_i16x8 and other.is_i8x16) {
166#if defined(TT_HAS_SSE4_1)
167 *this = numeric_array{_mm_cvtepi8_epi16(other.reg())};
168 return;
169#endif
170 } else if constexpr (is_f64x4 and other.is_f32x4) {
171#if defined(TT_HAS_AVX)
172 *this = numeric_array{_mm256_cvteps_pd(other.reg())};
173 return;
174#endif
175 } else if constexpr (is_f64x4 and other.is_i32x4) {
176#if defined(TT_HAS_AVX)
177 *this = numeric_array{_mm256_cvtepi32_pd(other.reg())};
178 return;
179#endif
180 } else if constexpr (is_f32x4 and other.is_f64x4) {
181#if defined(TT_HAS_AVX)
182 *this = numeric_array{_mm256_cvtpd_ps(other.reg())};
183 return;
184#endif
185 } else if constexpr (is_i32x4 and other.is_f64x4) {
186#if defined(TT_HAS_AVX)
187 *this = numeric_array{_mm256_cvtpd_epi32(other.reg())};
188 return;
189#endif
190 } else if constexpr (is_i32x8 and other.is_f32x8) {
191#if defined(TT_HAS_AVX)
192 *this = numeric_array{_mm256_cvtps_epi32(other.reg())};
193 return;
194#endif
195 } else if constexpr (is_f32x8 and other.is_i32x8) {
196#if defined(TT_HAS_AVX)
197 *this = numeric_array{_mm256_cvtepi32_ps(other.reg())};
198 return;
199#endif
200 }
201 }
202
203 for (size_t i = 0; i != N; ++i) {
204 if (i < M) {
205 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
206 // SSE conversion round floats before converting to integer.
207 v[i] = static_cast<value_type>(std::round(other[i]));
208 } else {
209 v[i] = static_cast<value_type>(other[i]);
210 }
211 } else {
212 v[i] = T{};
213 }
214 }
215 }
216
217 template<arithmetic U, size_t M>
218 [[nodiscard]] constexpr explicit numeric_array(numeric_array<U, M> const &other1, numeric_array<U, M> const &other2) noexcept
219 :
220 v()
221 {
222 if (!std::is_constant_evaluated()) {
223 if constexpr (is_i16x8 and other1.is_i32x4 and other2.is_i32x4) {
224#if defined(TT_HAS_SSE2)
225 *this = numeric_array{_mm_packs_epi32(other2.reg(), other1.reg())};
226 return;
227#endif
228 } else if constexpr (is_i8x16 and other1.is_i16x8 and other2.is_i16x8) {
229#if defined(TT_HAS_SSE2)
230 *this = numeric_array{_mm_packs_epi16(other2.reg(), other1.reg())};
231 return;
232#endif
233 } else if constexpr (is_u8x16 and other1.is_u16x8 and other2.is_u16x8) {
234#if defined(TT_HAS_SSE2)
235 *this = numeric_array{_mm_packus_epu16(other2.reg(), other1.reg())};
236 return;
237#endif
238 } else if constexpr (is_u16x8 and other1.is_u32x4 and other2.is_u32x4) {
239#if defined(TT_HAS_SSE4_1)
240 *this = numeric_array{_mm_packus_epu32(other2.reg(), other1.reg())};
241 return;
242#endif
243 } else if constexpr (is_f32x8 and other1.is_f32x4 and other2.is_f32x4) {
244#if defined(TT_HAS_AVX)
245 *this = numeric_array{_mm256_set_m128(other2.reg(), other1.reg())};
246 return;
247#endif
248 } else if constexpr (is_f64x4 and other1.is_f64x2 and other2.is_f64x2) {
249#if defined(TT_HAS_AVX)
250 *this = numeric_array{_mm256_set_m128d(other2.reg(), other1.reg())};
251 return;
252#endif
253 } else if constexpr (
254 std::is_integral_v<T> and std::is_integral_v<U> and (sizeof(T) * N == 32) and (sizeof(U) * M == 16)) {
255#if defined(TT_HAS_AVX)
256 *this = numeric_array{_mm256_set_m128i(other2.reg(), other1.reg())};
257 return;
258#endif
259 }
260 }
261
262 for (size_t i = 0; i != N; ++i) {
263 if (i < M) {
264 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
265 // SSE conversion round floats before converting to integer.
266 v[i] = static_cast<value_type>(std::round(other1[i]));
267 } else {
268 v[i] = static_cast<value_type>(other1[i]);
269 }
270 } else if (i < M * 2) {
271 if constexpr (std::is_integral_v<T> and std::is_floating_point_v<U>) {
272 // SSE conversion round floats before converting to integer.
273 v[i] = static_cast<value_type>(std::round(other2[i - M]));
274 } else {
275 v[i] = static_cast<value_type>(other2[i - M]);
276 }
277 } else {
278 v[i] = U{};
279 }
280 }
281 }
282
283 [[nodiscard]] constexpr numeric_array(std::initializer_list<T> rhs) noexcept : v()
284 {
285 auto src = std::begin(rhs);
286 auto dst = std::begin(v);
287
288 // Copy all values from the initializer list.
289 while (src != std::end(rhs) && dst != std::end(v)) {
290 *(dst++) = *(src++);
291 }
292
293 tt_axiom(
294 dst != std::end(v) || src == std::end(rhs),
295 "Expecting the std:initializer_list size to be <= to the size of the numeric array");
296
297 // Set all other elements to zero
298 while (dst != std::end(v)) {
299 *(dst++) = {};
300 }
301 }
302
303 [[nodiscard]] constexpr numeric_array(T const &first) noexcept requires(N == 1) : numeric_array({first}) {}
304
305 template<arithmetic... Rest>
306 requires(sizeof...(Rest) + 2 <= N)
307 [[nodiscard]] constexpr numeric_array(T const &first, T const &second, Rest const &...rest) noexcept :
308 numeric_array({first, second, narrow_cast<T>(rest)...})
309 {
310 }
311
312 [[nodiscard]] static constexpr numeric_array broadcast(T rhs) noexcept
313 {
314 auto r = numeric_array{};
315 for (size_t i = 0; i != N; ++i) {
316 r[i] = rhs;
317 }
318 return r;
319 }
320
321 [[nodiscard]] numeric_array(std::array<T, N> const &rhs) noexcept : v(rhs) {}
322
323 numeric_array &operator=(std::array<T, N> const &rhs) noexcept
324 {
325 v = rhs;
326 return *this;
327 }
328
329 [[nodiscard]] operator std::array<T, N>() const noexcept
330 {
331 return v;
332 }
333
#if defined(TT_HAS_SSE2)
/** Load the elements of a 16-byte integer array into a 128-bit register (unaligned load). */
[[nodiscard]] __m128i reg() const noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
{
    auto const *source = reinterpret_cast<__m128i const *>(v.data());
    return _mm_loadu_si128(source);
}

/** Load the four floats into a 128-bit register (unaligned load). */
[[nodiscard]] __m128 reg() const noexcept requires(is_f32x4)
{
    auto const *source = v.data();
    return _mm_loadu_ps(source);
}

/** Load the two doubles into a 128-bit register (unaligned load). */
[[nodiscard]] __m128d reg() const noexcept requires(is_f64x2)
{
    auto const *source = v.data();
    return _mm_loadu_pd(source);
}
#endif
354
#if defined(TT_HAS_SSE2)
/** Construct a 16-byte integer array from a 128-bit register (unaligned store). */
[[nodiscard]] explicit numeric_array(__m128i const &rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
{
    auto *destination = reinterpret_cast<__m128i *>(v.data());
    _mm_storeu_si128(destination, rhs);
}

/** Construct four floats from a 128-bit register (unaligned store). */
[[nodiscard]] explicit numeric_array(__m128 const &rhs) noexcept requires(is_f32x4)
{
    auto *destination = v.data();
    _mm_storeu_ps(destination, rhs);
}

/** Construct two doubles from a 128-bit register (unaligned store). */
[[nodiscard]] explicit numeric_array(__m128d const &rhs) noexcept requires(is_f64x2)
{
    auto *destination = v.data();
    _mm_storeu_pd(destination, rhs);
}
#endif
375
#if defined(TT_HAS_SSE2)
/** Overwrite a 16-byte integer array with the contents of a 128-bit register. */
numeric_array &operator=(__m128i const &rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 16)
{
    auto *destination = reinterpret_cast<__m128i *>(v.data());
    _mm_storeu_si128(destination, rhs);
    return *this;
}

/** Overwrite the four floats with the contents of a 128-bit register. */
numeric_array &operator=(__m128 const &rhs) noexcept requires(is_f32x4)
{
    auto *destination = v.data();
    _mm_storeu_ps(destination, rhs);
    return *this;
}

/** Overwrite the two doubles with the contents of a 128-bit register. */
numeric_array &operator=(__m128d const &rhs) noexcept requires(is_f64x2)
{
    auto *destination = v.data();
    _mm_storeu_pd(destination, rhs);
    return *this;
}
#endif
399
#if defined(TT_HAS_AVX)
/** Load the elements of a 32-byte integer array into a 256-bit register (unaligned load). */
[[nodiscard]] __m256i reg() const noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
{
    auto const *source = reinterpret_cast<__m256i const *>(v.data());
    return _mm256_loadu_si256(source);
}

/** Load the eight floats into a 256-bit register (unaligned load). */
[[nodiscard]] __m256 reg() const noexcept requires(is_f32x8)
{
    auto const *source = v.data();
    return _mm256_loadu_ps(source);
}

/** Load the four doubles into a 256-bit register (unaligned load). */
[[nodiscard]] __m256d reg() const noexcept requires(is_f64x4)
{
    auto const *source = v.data();
    return _mm256_loadu_pd(source);
}
#endif
420
#if defined(TT_HAS_AVX)
/** Construct a 32-byte integer array from a 256-bit register (unaligned store). */
[[nodiscard]] explicit numeric_array(__m256i const &rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
{
    auto *destination = reinterpret_cast<__m256i *>(v.data());
    _mm256_storeu_si256(destination, rhs);
}

/** Construct eight floats from a 256-bit register (unaligned store). */
[[nodiscard]] explicit numeric_array(__m256 const &rhs) noexcept requires(is_f32x8)
{
    auto *destination = v.data();
    _mm256_storeu_ps(destination, rhs);
}

/** Construct four doubles from a 256-bit register (unaligned store). */
[[nodiscard]] explicit numeric_array(__m256d const &rhs) noexcept requires(is_f64x4)
{
    auto *destination = v.data();
    _mm256_storeu_pd(destination, rhs);
}
#endif
441
#if defined(TT_HAS_AVX)
/** Overwrite a 32-byte integer array with the contents of a 256-bit register. */
numeric_array &operator=(__m256i const &rhs) noexcept requires(std::is_integral_v<T> and sizeof(T) * N == 32)
{
    auto *destination = reinterpret_cast<__m256i *>(v.data());
    _mm256_storeu_si256(destination, rhs);
    return *this;
}

/** Overwrite the eight floats with the contents of a 256-bit register. */
numeric_array &operator=(__m256 const &rhs) noexcept requires(is_f32x8)
{
    auto *destination = v.data();
    _mm256_storeu_ps(destination, rhs);
    return *this;
}

/** Overwrite the four doubles with the contents of a 256-bit register. */
numeric_array &operator=(__m256d const &rhs) noexcept requires(is_f64x4)
{
    auto *destination = v.data();
    _mm256_storeu_pd(destination, rhs);
    return *this;
}
#endif
465
466 template<typename Other>
467 [[nodiscard]] constexpr friend Other bit_cast(numeric_array const &rhs) noexcept
468 requires(sizeof(Other) == sizeof(container_type))
469 {
470 using rhs_value_type = typename std::remove_cvref_t<decltype(rhs)>::value_type;
471
472 if (not std::is_constant_evaluated()) {
473 if constexpr (Other::is_f32x4 and std::is_integral_v<rhs_value_type>) {
474#if defined(TT_HAS_SSE2)
475 return Other{_mm_castsi128_ps(rhs.reg())};
476#endif
477 } else if constexpr (Other::is_f32x4 and rhs.is_f64x2) {
478#if defined(TT_HAS_SSE2)
479 return Other{_mm_castpd_ps(rhs.reg())};
480#endif
481 } else if constexpr (Other::is_f64x2 and std::is_integral_v<rhs_value_type>) {
482#if defined(TT_HAS_SSE2)
483 return Other{_mm_castsi128_pd(rhs.reg())};
484#endif
485 } else if constexpr (Other::is_f64x2 and rhs.is_f32x4) {
486#if defined(TT_HAS_SSE2)
487 return Other{_mm_castps_pd(rhs.reg())};
488#endif
489 } else if constexpr (std::is_integral_v<Other::value_type> and rhs.is_f32x4) {
490#if defined(TT_HAS_SSE2)
491 return Other{_mm_castps_si128(rhs.reg())};
492#endif
493 } else if constexpr (std::is_integral_v<Other::value_type> and rhs.is_f64x2) {
494#if defined(TT_HAS_SSE2)
495 return Other{_mm_castpd_si128(rhs.reg())};
496#endif
497 } else if constexpr (std::is_integral_v<Other::value_type> and std::is_integral_v<rhs_value_type>) {
498#if defined(TT_HAS_SSE2)
499 return Other{rhs.reg()};
500#endif
501 }
502 }
503 return std::bit_cast<Other>(rhs);
504 }
505
509 {
// NOTE(review): the declaration for this body lies outside the visible excerpt;
// from the code below it takes two arrays `a` and `b` and returns a numeric_array.
//
// The generic loop produces r[0] = a[0], r[1] = b[0], r[2] = a[1], r[3] = b[1], ...
// i.e. the leading elements of `a` and `b` interleaved.
// When not constant-evaluated and `x86_64_v2` is set, the unpack-low intrinsic
// matching the element type is used instead for the listed 16-byte shapes.
510 if (not std::is_constant_evaluated()) {
511 if constexpr (x86_64_v2 and is_f64x2) {
512 return numeric_array{_mm_unpacklo_pd(a.reg(), b.reg())};
513 } else if constexpr (x86_64_v2 and is_f32x4) {
514 return numeric_array{_mm_unpacklo_ps(a.reg(), b.reg())};
515 } else if constexpr (x86_64_v2 and is_i64x2) {
516 return numeric_array{_mm_unpacklo_epi64(a.reg(), b.reg())};
517 } else if constexpr (x86_64_v2 and is_i32x4) {
518 return numeric_array{_mm_unpacklo_epi32(a.reg(), b.reg())};
519 } else if constexpr (x86_64_v2 and is_i16x8) {
520 return numeric_array{_mm_unpacklo_epi16(a.reg(), b.reg())};
521 } else if constexpr (x86_64_v2 and is_i8x16) {
522 return numeric_array{_mm_unpacklo_epi8(a.reg(), b.reg())};
523 }
524 }
525
// Fallback for all other shapes and for constant evaluation.
526 auto r = numeric_array{};
527 for (size_t i = 0; i != N; ++i) {
// Even result slots take from `a`, odd slots from `b`, both at index i / 2.
528 r[i] = (i % 2 == 0) ? a[i / 2] : b[i / 2];
529 }
530 return r;
531 }
532
537 template<size_t S>
538 [[nodiscard]] static constexpr numeric_array load(std::byte const *ptr) noexcept
539 {
540 auto r = numeric_array{};
541 std::memcpy(&r, ptr, S);
542 return r;
543 }
544
549 [[nodiscard]] static constexpr numeric_array load(std::byte const *ptr) noexcept
550 {
551 auto r = numeric_array{};
552 std::memcpy(&r, ptr, sizeof(r));
553 return r;
554 }
555
560 [[nodiscard]] static constexpr numeric_array load(T const *ptr) noexcept
561 {
562 auto r = numeric_array{};
563 std::memcpy(&r, ptr, sizeof(r));
564 return r;
565 }
566
567 template<size_t S>
568 constexpr void store(std::byte *ptr) const noexcept
569 {
570 std::memcpy(ptr, this, S);
571 }
572
576 constexpr void store(std::byte *ptr) const noexcept
577 {
578 store<sizeof(*this)>(ptr);
579 }
580
581 [[nodiscard]] constexpr T const &operator[](size_t i) const noexcept
582 {
583 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
584 tt_axiom(i < N);
585 return v[i];
586 }
587
588 [[nodiscard]] constexpr T &operator[](size_t i) noexcept
589 {
590 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
591 tt_axiom(i < N);
592 return v[i];
593 }
594
595 [[nodiscard]] constexpr reference front() noexcept
596 {
597 return v.front();
598 }
599
600 [[nodiscard]] constexpr const_reference front() const noexcept
601 {
602 return v.front();
603 }
604
605 [[nodiscard]] constexpr reference back() noexcept
606 {
607 return v.back();
608 }
609
610 [[nodiscard]] constexpr const_reference back() const noexcept
611 {
612 return v.back();
613 }
614
615 [[nodiscard]] constexpr pointer data() noexcept
616 {
617 return v.data();
618 }
619
620 [[nodiscard]] constexpr const_pointer data() const noexcept
621 {
622 return v.data();
623 }
624
625 [[nodiscard]] constexpr iterator begin() noexcept
626 {
627 return v.begin();
628 }
629
630 [[nodiscard]] constexpr const_iterator begin() const noexcept
631 {
632 return v.begin();
633 }
634
635 [[nodiscard]] constexpr const_iterator cbegin() const noexcept
636 {
637 return v.cbegin();
638 }
639
640 [[nodiscard]] constexpr iterator end() noexcept
641 {
642 return v.end();
643 }
644
645 [[nodiscard]] constexpr const_iterator end() const noexcept
646 {
647 return v.end();
648 }
649
650 [[nodiscard]] constexpr const_iterator cend() const noexcept
651 {
652 return v.cend();
653 }
654
655 [[nodiscard]] constexpr bool empty() const noexcept
656 {
657 return v.empty();
658 }
659
660 [[nodiscard]] constexpr size_type size() const noexcept
661 {
662 return v.size();
663 }
664
665 [[nodiscard]] constexpr size_type max_size() const noexcept
666 {
667 return v.max_size();
668 }
669
670 constexpr bool is_point() const noexcept
671 {
672 return v.back() != T{};
673 }
674
675 constexpr bool is_vector() const noexcept
676 {
677 return v.back() == T{};
678 }
679
680 constexpr bool is_opaque() const noexcept
681 {
682 return a() == T{1};
683 }
684
685 constexpr bool is_transparent() const noexcept
686 {
687 return a() == T{0};
688 }
689
690 [[nodiscard]] constexpr T const &x() const noexcept requires(N >= 1)
691 {
692 return std::get<0>(v);
693 }
694
695 [[nodiscard]] constexpr T const &y() const noexcept requires(N >= 2)
696 {
697 return std::get<1>(v);
698 }
699
700 [[nodiscard]] constexpr T const &z() const noexcept requires(N >= 3)
701 {
702 return std::get<2>(v);
703 }
704
705 [[nodiscard]] constexpr T const &w() const noexcept requires(N >= 4)
706 {
707 return std::get<3>(v);
708 }
709
710 [[nodiscard]] constexpr T &x() noexcept requires(N >= 1)
711 {
712 return std::get<0>(v);
713 }
714
715 [[nodiscard]] constexpr T &y() noexcept requires(N >= 2)
716 {
717 return std::get<1>(v);
718 }
719
720 [[nodiscard]] constexpr T &z() noexcept requires(N >= 3)
721 {
722 return std::get<2>(v);
723 }
724
725 [[nodiscard]] constexpr T &w() noexcept requires(N >= 4)
726 {
727 return std::get<3>(v);
728 }
729
730 [[nodiscard]] constexpr T const &r() const noexcept requires(N >= 1)
731 {
732 return std::get<0>(v);
733 }
734
735 [[nodiscard]] constexpr T const &g() const noexcept requires(N >= 2)
736 {
737 return std::get<1>(v);
738 }
739
740 [[nodiscard]] constexpr T const &b() const noexcept requires(N >= 3)
741 {
742 return std::get<2>(v);
743 }
744
745 [[nodiscard]] constexpr T const &a() const noexcept requires(N >= 4)
746 {
747 return std::get<3>(v);
748 }
749
750 [[nodiscard]] constexpr T &r() noexcept requires(N >= 1)
751 {
752 return std::get<0>(v);
753 }
754
755 [[nodiscard]] constexpr T &g() noexcept requires(N >= 2)
756 {
757 return std::get<1>(v);
758 }
759
760 [[nodiscard]] constexpr T &b() noexcept requires(N >= 3)
761 {
762 return std::get<2>(v);
763 }
764
765 [[nodiscard]] constexpr T &a() noexcept requires(N >= 4)
766 {
767 return std::get<3>(v);
768 }
769
770 [[nodiscard]] constexpr T const &width() const noexcept requires(N >= 1)
771 {
772 return std::get<0>(v);
773 }
774
775 [[nodiscard]] constexpr T const &height() const noexcept requires(N >= 2)
776 {
777 return std::get<1>(v);
778 }
779
780 [[nodiscard]] constexpr T const &depth() const noexcept requires(N >= 3)
781 {
782 return std::get<2>(v);
783 }
784
785 [[nodiscard]] constexpr T &width() noexcept requires(N >= 1)
786 {
787 return std::get<0>(v);
788 }
789
790 [[nodiscard]] constexpr T &height() noexcept requires(N >= 2)
791 {
792 return std::get<1>(v);
793 }
794
795 [[nodiscard]] constexpr T &depth() noexcept requires(N >= 3)
796 {
797 return std::get<2>(v);
798 }
799
800 constexpr numeric_array &operator<<=(unsigned int rhs) noexcept
801 {
802 return *this = *this << rhs;
803 }
804
805 constexpr numeric_array &operator>>=(unsigned int rhs) noexcept
806 {
807 return *this = *this >> rhs;
808 }
809
810 constexpr numeric_array &operator|=(numeric_array const &rhs) noexcept
811 {
812 return *this = *this | rhs;
813 }
814
815 constexpr numeric_array &operator|=(T const &rhs) noexcept
816 {
817 return *this = *this | rhs;
818 }
819
820 constexpr numeric_array &operator&=(numeric_array const &rhs) noexcept
821 {
822 return *this = *this & rhs;
823 }
824
825 constexpr numeric_array &operator&=(T const &rhs) noexcept
826 {
827 return *this = *this & rhs;
828 }
829
830 constexpr numeric_array &operator^=(numeric_array const &rhs) noexcept
831 {
832 return *this = *this ^ rhs;
833 }
834
835 constexpr numeric_array &operator^=(T const &rhs) noexcept
836 {
837 return *this = *this ^ rhs;
838 }
839
840 constexpr numeric_array &operator+=(numeric_array const &rhs) noexcept
841 {
842 return *this = *this + rhs;
843 }
844
845 constexpr numeric_array &operator+=(T const &rhs) noexcept
846 {
847 return *this = *this + rhs;
848 }
849
850 constexpr numeric_array &operator-=(numeric_array const &rhs) noexcept
851 {
852 return *this = *this - rhs;
853 }
854
855 constexpr numeric_array &operator-=(T const &rhs) noexcept
856 {
857 return *this = *this - rhs;
858 }
859
860 constexpr numeric_array &operator*=(numeric_array const &rhs) noexcept
861 {
862 return *this = *this * rhs;
863 }
864
865 constexpr numeric_array &operator*=(T const &rhs) noexcept
866 {
867 return *this = *this * rhs;
868 }
869
870 constexpr numeric_array &operator/=(numeric_array const &rhs) noexcept
871 {
872 return *this = *this / rhs;
873 }
874
875 constexpr numeric_array &operator/=(T const &rhs) noexcept
876 {
877 return *this = *this / rhs;
878 }
879
880 constexpr numeric_array &operator%=(numeric_array const &rhs) noexcept
881 {
882 return *this = *this % rhs;
883 }
884
885 constexpr numeric_array &operator%=(T const &rhs) noexcept
886 {
887 return *this = *this % rhs;
888 }
889
890 constexpr static ssize_t get_zero = -1;
891 constexpr static ssize_t get_one = -2;
892
897 template<size_t I>
898 [[nodiscard]] friend constexpr T &get(numeric_array &rhs) noexcept
899 {
900 static_assert(I < N, "Index out of bounds");
901 return std::get<I>(rhs.v);
902 }
903
909 template<ssize_t I>
910 [[nodiscard]] friend constexpr T get(numeric_array &&rhs) noexcept
911 {
912 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
913 static_assert(I >= -2 && I < narrow_cast<ssize_t>(N), "Index out of bounds");
914 if constexpr (I == get_zero) {
915 return T{0};
916 } else if constexpr (I == get_one) {
917 return T{1};
918 } else {
919 return std::get<I>(rhs.v);
920 }
921 }
922
928 template<ssize_t I>
929 [[nodiscard]] friend constexpr T get(numeric_array const &rhs) noexcept
930 {
931 static_assert(std::endian::native == std::endian::little, "Indices need to be reversed on big endian machines");
932 static_assert(I >= -2 && I < narrow_cast<ssize_t>(N), "Index out of bounds");
933 if constexpr (I == get_zero) {
934 return T{0};
935 } else if constexpr (I == get_one) {
936 return T{1};
937 } else {
938 return std::get<I>(rhs.v);
939 }
940 }
941
946 template<size_t Mask = ~size_t{0}>
947 [[nodiscard]] friend constexpr numeric_array zero(numeric_array rhs) noexcept
948 {
949 if (!std::is_constant_evaluated()) {
950 if constexpr (is_f32x4 && x86_64_v2) {
951 return numeric_array{f32x4_x64v2_zero<Mask & 0xf>(rhs.v)};
952 }
953 }
954
955 auto r = numeric_array{};
956 for (size_t i = 0; i != N; ++i) {
957 if (static_cast<bool>((Mask >> i) & 1)) {
958 r.v[i] = T{0};
959 } else {
960 r.v[i] = rhs.v[i];
961 }
962 }
963 return r;
964 }
965
970 template<size_t Mask = ~size_t{0}>
971 [[nodiscard]] friend constexpr numeric_array neg(numeric_array rhs) noexcept
972 {
973 if (!std::is_constant_evaluated()) {
974 if constexpr (is_f32x4 && x86_64_v2) {
975 return numeric_array{f32x4_x64v2_neg<Mask & 0xf>(rhs.v)};
976 }
977 }
978
979 auto r = numeric_array{};
980 for (size_t i = 0; i != N; ++i) {
981 if (static_cast<bool>((Mask >> i) & 1)) {
982 r.v[i] = -rhs.v[i];
983 } else {
984 r.v[i] = rhs.v[i];
985 }
986 }
987 return r;
988 }
989
990 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const &rhs) noexcept
991 {
992 auto r = numeric_array{};
993 for (size_t i = 0; i != N; ++i) {
994 // -rhs.v[i] will cause a memory load with msvc.
995 r.v[i] = T{} - rhs.v[i];
996 }
997 return r;
998 }
999
1000 [[nodiscard]] friend constexpr numeric_array abs(numeric_array const &rhs) noexcept
1001 {
1002 auto neg_rhs = -rhs;
1003
1004 auto r = numeric_array{};
1005 for (size_t i = 0; i != N; ++i) {
1006 r.v[i] = rhs.v[i] < T{} ? neg_rhs.v[i] : rhs.v[i];
1007 }
1008 return r;
1009 }
1010
1011 [[nodiscard]] friend constexpr numeric_array rcp(numeric_array const &rhs) noexcept
1012 {
1013 if (!std::is_constant_evaluated()) {
1014 if constexpr (is_f32x4 and x86_64_v2) {
1015 return numeric_array{_mm_rcp_ps(rhs.reg())};
1016 }
1017 }
1018
1019 auto r = numeric_array{};
1020 for (size_t i = 0; i != N; ++i) {
1021 r[i] = 1.0f / rhs.v[i];
1022 }
1023 return r;
1024 }
1025
1026 [[nodiscard]] friend constexpr numeric_array sqrt(numeric_array const &rhs) noexcept
1027 {
1028 if (!std::is_constant_evaluated()) {
1029 if constexpr (is_f32x4 and x86_64_v2) {
1030 return numeric_array{_mm_sqrt_ps(rhs.reg())};
1031 }
1032 }
1033
1034 auto r = numeric_array{};
1035 for (size_t i = 0; i != N; ++i) {
1036 r[i] = std::sqrt(rhs.v[i]);
1037 }
1038 return r;
1039 }
1040
1041 [[nodiscard]] friend constexpr numeric_array rcp_sqrt(numeric_array const &rhs) noexcept
1042 {
1043 if (!std::is_constant_evaluated()) {
1044 if constexpr (is_f32x4 and x86_64_v2) {
1045 return numeric_array{_mm_rcp_sqrt_ps(rhs.reg())};
1046 }
1047 }
1048
1049 auto r = numeric_array{};
1050 for (size_t i = 0; i != N; ++i) {
1051 r[i] = 1.0f / std::sqrt(rhs.v[i]);
1052 }
1053 return r;
1054 }
1055
1056 [[nodiscard]] friend constexpr numeric_array floor(numeric_array const &rhs) noexcept
1057 {
1058 if (!std::is_constant_evaluated()) {
1059 if constexpr (is_f32x4 and x86_64_v2) {
1060 return numeric_array{_mm_floor_ps(rhs.reg())};
1061 }
1062 }
1063
1064 auto r = numeric_array{};
1065 for (size_t i = 0; i != N; ++i) {
1066 r[i] = std::floor(rhs.v[i]);
1067 }
1068 return r;
1069 }
1070
1071 [[nodiscard]] friend constexpr numeric_array ceil(numeric_array const &rhs) noexcept
1072 {
1073 if (!std::is_constant_evaluated()) {
1074 if constexpr (is_f32x4 and x86_64_v2) {
1075 return numeric_array{_mm_ceil_ps(rhs.reg())};
1076 }
1077 }
1078
1079 auto r = numeric_array{};
1080 for (size_t i = 0; i != N; ++i) {
1081 r[i] = std::ceil(rhs.v[i]);
1082 }
1083 return r;
1084 }
1085
1086 [[nodiscard]] friend constexpr numeric_array round(numeric_array const &rhs) noexcept
1087 {
1088 if (!std::is_constant_evaluated()) {
1089 if constexpr (is_f32x4 and x86_64_v2) {
1090 return numeric_array{_mm_round_ps(rhs.reg(), _MM_FROUND_CUR_DIRECTION)};
1091 }
1092 }
1093
1094 auto r = numeric_array{};
1095 for (size_t i = 0; i != N; ++i) {
1096 r[i] = std::round(rhs.v[i]);
1097 }
1098 return r;
1099 }
1100
1108 template<size_t Mask>
1109 [[nodiscard]] friend constexpr T dot(numeric_array const &lhs, numeric_array const &rhs) noexcept
1110 {
1111 if (!std::is_constant_evaluated()) {
1112 if constexpr (is_f32x4 and x86_64_v2) {
1113 return f32x4_x64v2_dot<Mask>(lhs.v, rhs.v);
1114 }
1115 }
1116
1117 auto r = T{};
1118 for (size_t i = 0; i != N; ++i) {
1119 if (static_cast<bool>(Mask & (1_uz << i))) {
1120 r += lhs.v[i] * rhs.v[i];
1121 }
1122 }
1123 return r;
1124 }
1125
1133 template<size_t Mask>
1134 [[nodiscard]] friend constexpr T hypot(numeric_array const &rhs) noexcept
1135 {
1136 if (is_f32x4 && x86_64_v2 && !std::is_constant_evaluated()) {
1137 return f32x4_x64v2_hypot<Mask>(rhs.v);
1138 }
1139 return std::sqrt(dot<Mask>(rhs, rhs));
1140 }
1141
1149 template<size_t Mask>
1150 [[nodiscard]] friend constexpr T squared_hypot(numeric_array const &rhs) noexcept
1151 {
1152 return dot<Mask>(rhs, rhs);
1153 }
1154
1161 template<size_t Mask>
1162 [[nodiscard]] friend constexpr T rcp_hypot(numeric_array const &rhs) noexcept
1163 {
1164 if (is_f32x4 && x86_64_v2 && !std::is_constant_evaluated()) {
1165 return f32x4_x64v2_rcp_hypot<Mask>(rhs.v);
1166 }
1167
1168 return 1.0f / hypot<Mask>(rhs);
1169 }
1170
1179 template<size_t Mask>
1180 [[nodiscard]] friend constexpr numeric_array normalize(numeric_array const &rhs) noexcept
1181 {
1182 tt_axiom(rhs.is_vector());
1183
1184 if (is_f32x4 && x86_64_v2 && !std::is_constant_evaluated()) {
1185 return numeric_array{f32x4_x64v2_normalize<Mask>(rhs.v)};
1186 }
1187
1188 ttlet rcp_hypot_ = rcp_hypot<Mask>(rhs);
1189
1190 auto r = numeric_array{};
1191 for (size_t i = 0; i != N; ++i) {
1192 if (static_cast<bool>(Mask & (1_uz << i))) {
1193 r.v[i] = rhs.v[i] * rcp_hypot_;
1194 }
1195 }
1196 return r;
1197 }
1198
1199 [[nodiscard]] friend constexpr unsigned int eq(numeric_array const &lhs, numeric_array const &rhs) noexcept
1200 requires(N <= sizeof(unsigned int) * CHAR_BIT)
1201 {
1202 if (!std::is_constant_evaluated()) {
1203 if constexpr (is_f32x4 and x86_64_v2) {
1204 return static_cast<unsigned int>(_mm_movemask_ps(_mm_cmpeq_ps(lhs.reg(), rhs.reg())));
1205 }
1206 }
1207
1208 unsigned int r = 0;
1209 for (size_t i = 0; i != N; ++i) {
1210 r |= static_cast<unsigned int>(lhs.v[i] == rhs.v[i]) << i;
1211 }
1212 return r;
1213 }
1214
1215 [[nodiscard]] friend constexpr unsigned int ne(numeric_array const &lhs, numeric_array const &rhs) noexcept
1216 requires(N <= sizeof(unsigned int) * CHAR_BIT)
1217 {
1218 if (!std::is_constant_evaluated()) {
1219 if constexpr (is_f32x4 and x86_64_v2) {
1220 return static_cast<unsigned int>(_mm_movemask_ps(_mm_cmpne_ps(lhs.reg(), rhs.reg())));
1221 }
1222 }
1223 unsigned int r = 0;
1224 for (size_t i = 0; i != N; ++i) {
1225 r |= static_cast<unsigned int>(lhs.v[i] != rhs.v[i]) << i;
1226 }
1227 return r;
1228 }
1229
1230 [[nodiscard]] friend constexpr unsigned int lt(numeric_array const &lhs, numeric_array const &rhs) noexcept
1231 requires(N <= sizeof(unsigned int) * CHAR_BIT)
1232 {
1233 if (!std::is_constant_evaluated()) {
1234 if constexpr (is_f32x4 and x86_64_v2) {
1235 return static_cast<unsigned int>(_mm_movemask_ps(_mm_cmplt_ps(lhs.reg(), rhs.reg())));
1236 }
1237 }
1238 unsigned int r = 0;
1239 for (size_t i = 0; i != N; ++i) {
1240 r |= static_cast<unsigned int>(lhs.v[i] < rhs.v[i]) << i;
1241 }
1242 return r;
1243 }
1244
1245 [[nodiscard]] friend constexpr unsigned int gt(numeric_array const &lhs, numeric_array const &rhs) noexcept
1246 requires(N <= sizeof(unsigned int) * CHAR_BIT)
1247 {
1248 if (!std::is_constant_evaluated()) {
1249 if constexpr (is_f32x4 and x86_64_v2) {
1250 return static_cast<unsigned int>(_mm_movemask_ps(_mm_cmpgt_ps(lhs.reg(), rhs.reg())));
1251 }
1252 }
1253 unsigned int r = 0;
1254 for (size_t i = 0; i != N; ++i) {
1255 r |= static_cast<unsigned int>(lhs.v[i] > rhs.v[i]) << i;
1256 }
1257 return r;
1258 }
1259
1260 [[nodiscard]] friend constexpr unsigned int le(numeric_array const &lhs, numeric_array const &rhs) noexcept
1261 requires(N <= sizeof(unsigned int) * CHAR_BIT)
1262 {
1263 if (!std::is_constant_evaluated()) {
1264 if constexpr (is_f32x4 and x86_64_v2) {
1265 return static_cast<unsigned int>(_mm_movemask_ps(_mm_cmple_ps(lhs.reg(), rhs.reg())));
1266 }
1267 }
1268 unsigned int r = 0;
1269 for (size_t i = 0; i != N; ++i) {
1270 r |= static_cast<unsigned int>(lhs.v[i] <= rhs.v[i]) << i;
1271 }
1272 return r;
1273 }
1274
1275 [[nodiscard]] friend constexpr unsigned int ge(numeric_array const &lhs, numeric_array const &rhs) noexcept
1276 requires(N <= sizeof(unsigned int) * CHAR_BIT)
1277 {
1278 if (!std::is_constant_evaluated()) {
1279 if constexpr (is_f32x4 and x86_64_v2) {
1280 return static_cast<unsigned int>(_mm_movemask_ps(_mm_cmpge_ps(lhs.reg(), rhs.reg())));
1281 }
1282 }
1283 unsigned int r = 0;
1284 for (size_t i = 0; i != N; ++i) {
1285 r |= static_cast<unsigned int>(lhs.v[i] >= rhs.v[i]) << i;
1286 }
1287 return r;
1288 }
1289
1290 [[nodiscard]] static constexpr value_type zero_mask() noexcept
1291 {
1292 std::array<unsigned char, sizeof(value_type)> bytes;
1293 for (size_t i = 0; i != bytes.size(); ++i) {
1294 bytes[i] = 0;
1295 }
1296 return std::bit_cast<value_type>(bytes);
1297 }
1298
1299 [[nodiscard]] static constexpr value_type ones_mask() noexcept
1300 {
1301 static_assert(CHAR_BIT == 8);
1302
1303 std::array<unsigned char, sizeof(value_type)> bytes;
1304 for (size_t i = 0; i != bytes.size(); ++i) {
1305 bytes[i] = 0xff;
1306 }
1307 return std::bit_cast<value_type>(bytes);
1308 }
1309
1310 [[nodiscard]] friend constexpr numeric_array gt_mask(numeric_array const &lhs, numeric_array const &rhs) noexcept
1311 {
1312 if (not std::is_constant_evaluated()) {
1313 if constexpr (is_f32x4 and x86_64_v2) {
1314 return numeric_array{_mm_cmpgt_ps(lhs.reg(), rhs.reg())};
1315 } else if constexpr (is_i64x4 and x86_64_v2) {
1316 return numeric_array{_mm_cmpgt_epi64(lhs.reg(), rhs.reg())};
1317 } else if constexpr (is_i32x4 and x86_64_v2) {
1318 return numeric_array{_mm_cmpgt_epi32(lhs.reg(), rhs.reg())};
1319 } else if constexpr (is_i16x4 and x86_64_v2) {
1320 return numeric_array{_mm_cmpgt_epi16(lhs.reg(), rhs.reg())};
1321 }
1322 }
1323
1324 constexpr value_type zero = zero_mask();
1325 constexpr value_type ones = ones_mask();
1326
1327 auto r = numeric_array{};
1328 for (size_t i = 0; i != N; ++i) {
1329 r[i] = lhs.v[i] > rhs.v[i] ? ones : zero;
1330 }
1331 return r;
1332 }
1333
1334
1335 [[nodiscard]] friend constexpr numeric_array ge_mask(numeric_array const &lhs, numeric_array const &rhs) noexcept
1336 {
1337 if (not std::is_constant_evaluated()) {
1338 if constexpr (is_f32x4 and x86_64_v2) {
1339 return numeric_array{_mm_cmpge_ps(lhs.reg(), rhs.reg())};
1340 }
1341 }
1342
1343 constexpr value_type zero = zero_mask();
1344 constexpr value_type ones = ones_mask();
1345
1346 auto r = numeric_array{};
1347 for (size_t i = 0; i != N; ++i) {
1348 r[i] = lhs.v[i] >= rhs.v[i] ? ones : zero;
1349 }
1350 return r;
1351 }
1352
1353 [[nodiscard]] friend constexpr bool operator==(numeric_array const &lhs, numeric_array const &rhs) noexcept
1354 {
1355 if (!std::is_constant_evaluated()) {
1356 if constexpr (is_f32x4 && x86_64_v2) {
1357 // MSVC cannot vectorize comparison.
1358 return f32x4_x64v2_eq(lhs.v, rhs.v);
1359 }
1360 }
1361
1362 auto r = true;
1363 for (size_t i = 0; i != N; ++i) {
1364 r &= (lhs.v[i] == rhs.v[i]);
1365 }
1366 return r;
1367 }
1368
1369 [[nodiscard]] friend constexpr bool operator!=(numeric_array const &lhs, numeric_array const &rhs) noexcept
1370 {
1371 return !(lhs == rhs);
1372 }
1373
1374 [[nodiscard]] friend constexpr numeric_array operator<<(numeric_array const &lhs, unsigned int rhs) noexcept
1375 {
1376 if (not std::is_constant_evaluated()) {
1377 if constexpr (x86_64_v2 and is_i64x2) {
1378 return numeric_array{_mm_slli_epi64(lhs.reg(), rhs)};
1379 } else if constexpr (x86_64_v2 and is_i32x4) {
1380 return numeric_array{_mm_slli_epi32(lhs.reg(), rhs)};
1381 } else if constexpr (x86_64_v2 and is_i16x8) {
1382 return numeric_array{_mm_slli_epi32(lhs.reg(), rhs)};
1383 } else if constexpr (x86_64_v2 and is_u64x2) {
1384 return numeric_array{_mm_slli_epi64(lhs.reg(), rhs)};
1385 } else if constexpr (x86_64_v2 and is_u32x4) {
1386 return numeric_array{_mm_slli_epi32(lhs.reg(), rhs)};
1387 } else if constexpr (x86_64_v2 and is_u16x8) {
1388 return numeric_array{_mm_slli_epi32(lhs.reg(), rhs)};
1389 }
1390 }
1391
1392 auto r = numeric_array{};
1393 for (size_t i = 0; i != N; ++i) {
1394 r.v[i] = lhs.v[i] << rhs;
1395 }
1396 return r;
1397 }
1398
1399 [[nodiscard]] friend constexpr numeric_array operator>>(numeric_array const &lhs, unsigned int rhs) noexcept
1400 {
1401 if (not std::is_constant_evaluated()) {
1402 if constexpr (x86_64_v2 and is_i32x4) {
1403 return numeric_array{_mm_srai_epi32(lhs.reg(), rhs)};
1404 } else if constexpr (x86_64_v2 and is_i16x8) {
1405 return numeric_array{_mm_srai_epi16(lhs.reg(), rhs)};
1406 } else if constexpr (x86_64_v2 and is_u64x2) {
1407 return numeric_array{_mm_srli_epi64(lhs.reg(), rhs)};
1408 } else if constexpr (x86_64_v2 and is_u32x4) {
1409 return numeric_array{_mm_srli_epi32(lhs.reg(), rhs)};
1410 } else if constexpr (x86_64_v2 and is_u16x8) {
1411 return numeric_array{_mm_srli_epi16(lhs.reg(), rhs)};
1412 }
1413 }
1414
1415 auto r = numeric_array{};
1416 for (size_t i = 0; i != N; ++i) {
1417 r.v[i] = lhs.v[i] >> rhs;
1418 }
1419 return r;
1420 }
1421
1422 [[nodiscard]] friend constexpr numeric_array operator|(numeric_array const &lhs, numeric_array const &rhs) noexcept
1423 {
1424 if (!std::is_constant_evaluated()) {
1425 if constexpr (std::is_integral_v<T> and x86_64_v2) {
1426 return numeric_array{_mm_or_si128(lhs.reg(), rhs.reg())};
1427 }
1428 }
1429 auto r = numeric_array{};
1430 for (size_t i = 0; i != N; ++i) {
1431 r.v[i] = lhs.v[i] | rhs.v[i];
1432 }
1433 return r;
1434 }
1435
1436 [[nodiscard]] friend constexpr numeric_array operator|(numeric_array const &lhs, T const &rhs) noexcept
1437 {
1438 return lhs | broadcast(rhs);
1439 }
1440
1441 [[nodiscard]] friend constexpr numeric_array operator|(T const &lhs, numeric_array const &rhs) noexcept
1442 {
1443 return broadcast(lhs) | rhs;
1444 }
1445
1446 [[nodiscard]] friend constexpr numeric_array operator&(numeric_array const &lhs, numeric_array const &rhs) noexcept
1447 {
1448 if (!std::is_constant_evaluated()) {
1449 if constexpr (std::is_integral_v<T> and x86_64_v2) {
1450 return numeric_array{_mm_and_si128(lhs.reg(), rhs.reg())};
1451 }
1452 }
1453 auto r = numeric_array{};
1454 for (size_t i = 0; i != N; ++i) {
1455 r.v[i] = lhs.v[i] & rhs.v[i];
1456 }
1457 return r;
1458 }
1459
1460 [[nodiscard]] friend constexpr numeric_array operator&(numeric_array const &lhs, T const &rhs) noexcept
1461 {
1462 return lhs & broadcast(rhs);
1463 }
1464
1465 [[nodiscard]] friend constexpr numeric_array operator&(T const &lhs, numeric_array const &rhs) noexcept
1466 {
1467 return broadcast(lhs) & rhs;
1468 }
1469
1470 [[nodiscard]] friend constexpr numeric_array operator^(numeric_array const &lhs, numeric_array const &rhs) noexcept
1471 {
1472 if (!std::is_constant_evaluated()) {
1473 if constexpr (std::is_integral_v<T> and x86_64_v2) {
1474 return numeric_array{_mm_xor_si128(lhs.reg(), rhs.reg())};
1475 }
1476 }
1477 auto r = numeric_array{};
1478 for (size_t i = 0; i != N; ++i) {
1479 r.v[i] = lhs.v[i] ^ rhs.v[i];
1480 }
1481 return r;
1482 }
1483
1484 [[nodiscard]] friend constexpr numeric_array operator^(numeric_array const &lhs, T const &rhs) noexcept
1485 {
1486 return lhs ^ broadcast(rhs);
1487 }
1488
1489 [[nodiscard]] friend constexpr numeric_array operator^(T const &lhs, numeric_array const &rhs) noexcept
1490 {
1491 return broadcast(lhs) ^ rhs;
1492 }
1493
1494 [[nodiscard]] friend constexpr numeric_array operator+(numeric_array const &lhs, numeric_array const &rhs) noexcept
1495 {
1496 if (!std::is_constant_evaluated()) {
1497 if constexpr (x86_64_v2_5 and lhs.is_f32x8 and rhs.is_f32x8) {
1498 return numeric_array{_mm256_add_ps(lhs.reg(), rhs.reg())};
1499 }
1500 }
1501
1502 auto r = numeric_array{};
1503 for (size_t i = 0; i != N; ++i) {
1504 r.v[i] = lhs.v[i] + rhs.v[i];
1505 }
1506 return r;
1507 }
1508
1509 [[nodiscard]] friend constexpr numeric_array operator+(numeric_array const &lhs, T const &rhs) noexcept
1510 {
1511 return lhs + broadcast(rhs);
1512 }
1513
1514 [[nodiscard]] friend constexpr numeric_array operator+(T const &lhs, numeric_array const &rhs) noexcept
1515 {
1516 return broadcast(lhs) + rhs;
1517 }
1518
1519 [[nodiscard]] friend constexpr numeric_array hadd(numeric_array const &lhs, numeric_array const &rhs) noexcept
1520 {
1521 if (!std::is_constant_evaluated()) {
1522 if constexpr (is_f64x2 and x86_64_v2) {
1523 return numeric_array{_mm_hadd_pd(lhs.reg(), rhs.reg())};
1524 } else if constexpr (is_f32x4 and x86_64_v2) {
1525 return numeric_array{_mm_hadd_ps(lhs.reg(), rhs.reg())};
1526 } else if constexpr (is_i32x4 and x86_64_v2) {
1527 return numeric_array{_mm_hadd_epi32(lhs.reg(), rhs.reg())};
1528 } else if constexpr (is_i16x8 and x86_64_v2) {
1529 return numeric_array{_mm_hadd_epi16(lhs.reg(), rhs.reg())};
1530 } else if constexpr (is_i8x16 and x86_64_v2) {
1531 return numeric_array{_mm_hadd_epi8(lhs.reg(), rhs.reg())};
1532 }
1533 }
1534
1535 tt_axiom(N % 2 == 0);
1536
1537 auto r = numeric_array{};
1538
1539 size_t src_i = 0;
1540 size_t dst_i = 0;
1541 while (src_i != N) {
1542 auto tmp = lhs[src_i++];
1543 tmp += lhs[src_i++];
1544 r.v[dst_i++] = tmp;
1545 }
1546
1547 src_i = 0;
1548 while (src_i != N) {
1549 auto tmp = rhs[src_i++];
1550 tmp += rhs[src_i++];
1551 r.v[dst_i++] = tmp;
1552 }
1553 return r;
1554 }
1555
1556 [[nodiscard]] friend constexpr numeric_array hsub(numeric_array const &lhs, numeric_array const &rhs) noexcept
1557 {
1558 if (!std::is_constant_evaluated()) {
1559 if constexpr (is_f64x2 and x86_64_v2) {
1560 return numeric_array{_mm_hsub_pd(lhs.reg(), rhs.reg())};
1561 } else if constexpr (is_f32x4 and x86_64_v2) {
1562 return numeric_array{_mm_hsub_ps(lhs.reg(), rhs.reg())};
1563 } else if constexpr (is_i32x4 and x86_64_v2) {
1564 return numeric_array{_mm_hsub_epi32(lhs.reg(), rhs.reg())};
1565 } else if constexpr (is_i16x8 and x86_64_v2) {
1566 return numeric_array{_mm_hsub_epi16(lhs.reg(), rhs.reg())};
1567 } else if constexpr (is_i8x16 and x86_64_v2) {
1568 return numeric_array{_mm_hsub_epi8(lhs.reg(), rhs.reg())};
1569 }
1570 }
1571
1572 tt_axiom(N % 2 == 0);
1573
1574 auto r = numeric_array{};
1575
1576 size_t src_i = 0;
1577 size_t dst_i = 0;
1578 while (src_i != N) {
1579 auto tmp = lhs[src_i++];
1580 tmp -= lhs[src_i++];
1581 r.v[dst_i++] = tmp;
1582 }
1583
1584 src_i = 0;
1585 while (src_i != N) {
1586 auto tmp = rhs[src_i++];
1587 tmp -= rhs[src_i++];
1588 r.v[dst_i++] = tmp;
1589 }
1590 return r;
1591 }
1592
1593 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const &lhs, numeric_array const &rhs) noexcept
1594 {
1595 if (!std::is_constant_evaluated()) {
1596 if constexpr (x86_64_v2_5 and lhs.is_f32x8 and rhs.is_f32x8) {
1597 return numeric_array{_mm256_sub_ps(lhs.reg(), rhs.reg())};
1598 }
1599 }
1600
1601 auto r = numeric_array{};
1602 for (size_t i = 0; i != N; ++i) {
1603 r.v[i] = lhs.v[i] - rhs.v[i];
1604 }
1605 return r;
1606 }
1607
1608 [[nodiscard]] friend constexpr numeric_array operator-(numeric_array const &lhs, T const &rhs) noexcept
1609 {
1610 return lhs - broadcast(rhs);
1611 }
1612
1613 [[nodiscard]] friend constexpr numeric_array operator-(T const &lhs, numeric_array const &rhs) noexcept
1614 {
1615 return broadcast(lhs) - rhs;
1616 }
1617
1622 template<size_t Mask = ~size_t{0}>
1623 [[nodiscard]] friend constexpr numeric_array addsub(numeric_array const &lhs, numeric_array const &rhs) noexcept
1624 {
1625 if (!std::is_constant_evaluated()) {
1626 if constexpr (is_f32x4 && x86_64_v2) {
1627 return numeric_array{f32x4_x64v2_addsub<Mask & 0xf>(lhs.v, rhs.v)};
1628 }
1629 }
1630
1631 auto r = numeric_array{};
1632 for (size_t i = 0; i != N; ++i) {
1633 if (static_cast<bool>((Mask >> i) & 1)) {
1634 r.v[i] = lhs.v[i] + rhs.v[i];
1635 } else {
1636 r.v[i] = lhs.v[i] - rhs.v[i];
1637 }
1638 }
1639 return r;
1640 }
1641
1642 [[nodiscard]] friend constexpr numeric_array operator*(numeric_array const &lhs, numeric_array const &rhs) noexcept
1643 {
1644 if (!std::is_constant_evaluated()) {
1645 if constexpr (x86_64_v2_5 and lhs.is_f32x8 and rhs.is_f32x8) {
1646 return numeric_array{_mm256_mul_ps(lhs.reg(), rhs.reg())};
1647 }
1648 }
1649
1650 auto r = numeric_array{};
1651 for (size_t i = 0; i != N; ++i) {
1652 r.v[i] = lhs.v[i] * rhs.v[i];
1653 }
1654 return r;
1655 }
1656
1657 [[nodiscard]] friend constexpr numeric_array operator*(numeric_array const &lhs, T const &rhs) noexcept
1658 {
1659 return lhs * broadcast(rhs);
1660 }
1661
1662 [[nodiscard]] friend constexpr numeric_array operator*(T const &lhs, numeric_array const &rhs) noexcept
1663 {
1664 return broadcast(lhs) * rhs;
1665 }
1666
1667 [[nodiscard]] friend constexpr numeric_array operator/(numeric_array const &lhs, numeric_array const &rhs) noexcept
1668 {
1669 if (!std::is_constant_evaluated()) {
1670 if constexpr (x86_64_v2_5 and lhs.is_f32x8 and rhs.is_f32x8) {
1671 return numeric_array{_mm256_div_ps(lhs.reg(), rhs.reg())};
1672 }
1673 }
1674
1675 auto r = numeric_array{};
1676 for (size_t i = 0; i != N; ++i) {
1677 r.v[i] = lhs.v[i] / rhs.v[i];
1678 }
1679 return r;
1680 }
1681
1682 [[nodiscard]] friend constexpr numeric_array operator/(numeric_array const &lhs, T const &rhs) noexcept
1683 {
1684 return lhs / broadcast(rhs);
1685 }
1686
1687 [[nodiscard]] friend constexpr numeric_array operator/(T const &lhs, numeric_array const &rhs) noexcept
1688 {
1689 return broadcast(lhs) / rhs;
1690 }
1691
1692 [[nodiscard]] friend constexpr numeric_array operator%(numeric_array const &lhs, numeric_array const &rhs) noexcept
1693 {
1694 auto r = numeric_array{};
1695 for (size_t i = 0; i != N; ++i) {
1696 r.v[i] = lhs.v[i] % rhs.v[i];
1697 }
1698 return r;
1699 }
1700
1701 [[nodiscard]] friend constexpr numeric_array operator%(numeric_array const &lhs, T const &rhs) noexcept
1702 {
1703 return lhs % broadcast(rhs);
1704 }
1705
1706 [[nodiscard]] friend constexpr numeric_array operator%(T const &lhs, numeric_array const &rhs) noexcept
1707 {
1708 return broadcast(lhs) % rhs;
1709 }
1710
1711 [[nodiscard]] friend constexpr numeric_array min(numeric_array const &lhs, numeric_array const &rhs) noexcept
1712 {
1713 auto r = numeric_array{};
1714 for (size_t i = 0; i != N; ++i) {
1715 // std::min() causes vectorization failure with msvc
1716 r.v[i] = lhs.v[i] < rhs.v[i] ? lhs.v[i] : rhs.v[i];
1717 }
1718 return r;
1719 }
1720
1721 [[nodiscard]] friend constexpr numeric_array max(numeric_array const &lhs, numeric_array const &rhs) noexcept
1722 {
1723 auto r = numeric_array{};
1724 for (size_t i = 0; i != N; ++i) {
1725 // std::max() causes vectorization failure with msvc
1726 r.v[i] = lhs.v[i] > rhs.v[i] ? lhs.v[i] : rhs.v[i];
1727 }
1728 return r;
1729 }
1730
1731 [[nodiscard]] friend constexpr numeric_array
1732 clamp(numeric_array const &lhs, numeric_array const &low, numeric_array const &high) noexcept
1733 {
1734 auto r = numeric_array{};
1735 for (size_t i = 0; i != N; ++i) {
1736 // std::clamp() causes vectorization failure with msvc
1737 r.v[i] = lhs.v[i] < low.v[i] ? low.v[i] : lhs.v[i] > high.v[i] ? high.v[i] : lhs.v[i];
1738 }
1739 return r;
1740 }
1741
1744 [[nodiscard]] friend constexpr numeric_array cross_2D(numeric_array const &rhs) noexcept requires(N >= 2)
1745 {
1746 tt_axiom(rhs.z() == 0.0f && rhs.is_vector());
1747 return numeric_array{-rhs.y(), rhs.x()};
1748 }
1749
1752 [[nodiscard]] friend constexpr numeric_array normal_2D(numeric_array const &rhs) noexcept requires(N >= 2)
1753 {
1754 return normalize<0b0011>(cross_2D(rhs));
1755 }
1756
1759 [[nodiscard]] friend constexpr float cross_2D(numeric_array const &lhs, numeric_array const &rhs) noexcept requires(N >= 2)
1760 {
1761 if (is_f32x4 && x86_64_v2 && !std::is_constant_evaluated()) {
1762 return f32x4_x64v2_viktor_cross(lhs.v, rhs.v);
1763
1764 } else {
1765 return lhs.x() * rhs.y() - lhs.y() * rhs.x();
1766 }
1767 }
1768
1769 // x=a.y*b.z - a.z*b.y
1770 // y=a.z*b.x - a.x*b.z
1771 // z=a.x*b.y - a.y*b.x
1772 // w=a.w*b.w - a.w*b.w
1773 [[nodiscard]] constexpr friend numeric_array cross_3D(numeric_array const &lhs, numeric_array const &rhs) noexcept
1774 {
1775 if (!std::is_constant_evaluated()) {
1776 if constexpr (is_f32x4 && x86_64_v2) {
1777 return numeric_array{f32x4_x64v2_cross(lhs.v, rhs.v)};
1778 }
1779 }
1780
1781 return numeric_array{
1782 lhs.y() * rhs.z() - lhs.z() * rhs.y(),
1783 lhs.z() * rhs.x() - lhs.x() * rhs.z(),
1784 lhs.x() * rhs.y() - lhs.y() * rhs.x(),
1785 0.0f};
1786 }
1787
1788 // w + x*i + y*j + z*k
1789 //
1790 // (w1*x2 + x1*w2 + y1*z2 - z1*y2)i
1791 // + (w1*y2 - x1*z2 + y1*w2 + z1*x2)j
1792 // + (w1*z2 + x1*y2 - y1*x2 + z1*w2)k
1793 // + (w1*w2 - x1*x2 - y1*y2 - z1*z2)
1794 template<int D>
1795 requires(D == 4) [[nodiscard]] friend numeric_array
1796 hamilton_cross(numeric_array const &lhs, numeric_array const &rhs) noexcept
1797 {
1798 ttlet col0 = lhs.wwww() * rhs;
1799 ttlet col1 = lhs.xxxx() * rhs.wzyx();
1800 ttlet col2 = lhs.yyyy() * rhs.zwxy();
1801 ttlet col3 = lhs.zzzz() * rhs.yxwz();
1802
1803 ttlet col01 = addsub(col0, col1);
1804 ttlet col012 = addsub(col01.xzyw(), col2.xzyw()).xzyw();
1805
1806 return numeric_array{
1807
1808 };
1809 }
1810
1814 [[nodiscard]] friend constexpr numeric_array shift_left(numeric_array const &lhs, unsigned int rhs) noexcept
1815 {
1816 numeric_array r;
1817 for (ssize_t i = 0; i != N; ++i) {
1818 if ((i - rhs) >= 0) {
1819 r[i] = lhs[i - rhs];
1820 } else {
1821 r[i] = T{};
1822 }
1823 }
1824 return r;
1825 }
1826
1830 [[nodiscard]] friend constexpr numeric_array shift_right(numeric_array const &lhs, unsigned int rhs) noexcept
1831 {
1832 numeric_array r;
1833 for (ssize_t i = 0; i != N; ++i) {
1834 if ((i + rhs) < N) {
1835 r[i] = lhs[i + rhs];
1836 } else {
1837 r[i] = T{};
1838 }
1839 }
1840 return r;
1841 }
1842
1843 [[nodiscard]] friend constexpr numeric_array
1844 blend(numeric_array const &a, numeric_array const &b, numeric_array const &mask) requires(is_i8x16)
1845 {
1846 if (!std::is_constant_evaluated()) {
1847 if constexpr (x86_64_v2) {
1848 return numeric_array{_mm_blendv_epi8(a.reg(), b.reg(), mask.reg())};
1849 }
1850 }
1851
1852 auto r = numeric_array{};
1853
1854 for (size_t i = 0; i != N; ++i) {
1855 r[i] = mask[i] >= 0 ? a[i] : b[i];
1856 }
1857
1858 return r;
1859 }
1860
1861 [[nodiscard]] static constexpr numeric_array byte_srl_shuffle_indices(unsigned int rhs) requires(is_i8x16)
1862 {
1863 static_assert(std::endian::native == std::endian::little);
1864
1865 auto r = numeric_array{};
1866 for (auto i = 0; i != 16; ++i) {
1867 if ((i + rhs) < 16) {
1868 r[i] = narrow_cast<int8_t>(i + rhs);
1869 } else {
1870 // Indices set to -1 result in a zero after a byte shuffle.
1871 r[i] = -1;
1872 }
1873 }
1874 return r;
1875 }
1876
1877 [[nodiscard]] static constexpr numeric_array byte_sll_shuffle_indices(unsigned int rhs) requires(is_i8x16)
1878 {
1879 static_assert(std::endian::native == std::endian::little);
1880
1881 auto r = numeric_array{};
1882 for (auto i = 0; i != 16; ++i) {
1883 if ((i - rhs) >= 0) {
1884 r[i] = narrow_cast<int8_t>(i - rhs);
1885 } else {
1886 // Indices set to -1 result in a zero after a byte shuffle.
1887 r[i] = -1;
1888 }
1889 }
1890 return r;
1891 }
1892
1895 [[nodiscard]] friend constexpr numeric_array shuffle(numeric_array const &lhs, numeric_array const &rhs) requires(is_i8x16)
1896 {
1897 if (!std::is_constant_evaluated()) {
1898 if constexpr (x86_64_v2) {
1899 return numeric_array{_mm_shuffle_epi8(lhs.reg(), rhs.reg())};
1900 }
1901 }
1902
1903 auto r = numeric_array{};
1904
1905 for (size_t i = 0; i != N; ++i) {
1906 if (rhs[i] >= 0) {
1907 r[i] = lhs[rhs[i] & 0xf];
1908 } else {
1909 r[i] = 0;
1910 }
1911 }
1912
1913 return r;
1914 }
1915
1918 [[nodiscard]] friend constexpr numeric_array midpoint(numeric_array const &p1, numeric_array const &p2) noexcept
1919 {
1920 tt_axiom(p1.is_point());
1921 tt_axiom(p2.is_point());
1922 return (p1 + p2) * 0.5f;
1923 }
1924
1927 [[nodiscard]] friend constexpr numeric_array reflect_point(numeric_array const &p, numeric_array const anchor) noexcept
1928 {
1929 tt_axiom(p.is_point());
1930 tt_axiom(anchor.is_point());
1931 return anchor - (p - anchor);
1932 }
1933
1934 template<typename... Columns>
1935 [[nodiscard]] friend constexpr std::array<numeric_array, N> transpose(Columns const &...columns) noexcept
1936 {
1937 static_assert(sizeof...(Columns) == N, "Can only transpose square matrices");
1938
1940
1941 if (is_f32x4 && x86_64_v2 && !std::is_constant_evaluated()) {
1942 auto tmp = f32x4_x64v2_transpose(columns.v...);
1943 for (int i = 0; i != N; ++i) {
1944 r[i] = numeric_array{tmp[i]};
1945 }
1946
1947 } else {
1948 transpose_detail<0, Columns...>(columns..., r);
1949 }
1950
1951 return r;
1952 }
1953
1954 [[nodiscard]] constexpr friend numeric_array composit(numeric_array const &under, numeric_array const &over) noexcept
1955 requires(N == 4 && std::is_floating_point_v<T>)
1956 {
1957 if (over.is_transparent()) {
1958 return under;
1959 }
1960 if (over.is_opaque()) {
1961 return over;
1962 }
1963
1964 ttlet over_alpha = over.wwww();
1965 ttlet under_alpha = under.wwww();
1966
1967 ttlet over_color = over.xyz1();
1968 ttlet under_color = under.xyz1();
1969
1970 ttlet output_color = over_color * over_alpha + under_color * under_alpha * (T{1} - over_alpha);
1971
1972 return output_color / output_color.www1();
1973 }
1974
/** Format the array as text.
 *
 * @return The elements formatted with "{}", separated by "; " and enclosed
 *         in parentheses.
 */
[[nodiscard]] friend std::string to_string(numeric_array const &rhs) noexcept
{
    auto text = std::string{};
    text += '(';

    for (size_t idx = 0; idx != N; ++idx) {
        if (idx > 0) {
            text += "; ";
        }
        text += std::format("{}", rhs[idx]);
    }

    text += ')';
    return text;
}
1989
1990 friend std::ostream &operator<<(std::ostream &lhs, numeric_array const &rhs)
1991 {
1992 return lhs << to_string(rhs);
1993 }
1994
1999 template<size_t FromElement, size_t ToElement, size_t ZeroMask = 0>
2000 [[nodiscard]] constexpr friend numeric_array insert(numeric_array const &lhs, numeric_array const &rhs)
2001 {
2002 auto r = numeric_array{};
2003
2004 if (!std::is_constant_evaluated()) {
2005 if constexpr (is_f32x4 && x86_64_v2) {
2006 return numeric_array{f32x4_x64v2_insert<FromElement, ToElement, ZeroMask>(lhs.v, rhs.v)};
2007 } else if constexpr (is_u64x2 and x86_64_v2) {
2008 return numeric_array{u64x2_x64v2_insert<FromElement, ToElement, ZeroMask>(lhs.v, rhs.v)};
2009 }
2010 }
2011
2012 for (size_t i = 0; i != N; ++i) {
2013 if ((ZeroMask >> i) & 1) {
2014 r[i] = T{};
2015 } else if (i == ToElement) {
2016 r[i] = rhs[FromElement];
2017 } else {
2018 r[i] = lhs[i];
2019 }
2020 }
2021
2022 return r;
2023 }
2024
2032 template<ssize_t... Elements>
2033 [[nodiscard]] constexpr numeric_array swizzle() const
2034 {
2035 static_assert(sizeof...(Elements) <= N);
2036
2037 if (!std::is_constant_evaluated()) {
2038 if constexpr (is_f32x4 && x86_64_v2) {
2039 return numeric_array{f32x4_x64v2_swizzle<Elements...>(v)};
2040 } else if constexpr (is_i32x4 && x86_64_v2) {
2041 return numeric_array{i32x4_x64v2_swizzle<Elements...>(v)};
2042 } else if constexpr (is_u32x4 && x86_64_v2) {
2043 return numeric_array{u32x4_x64v2_swizzle<Elements...>(v)};
2044 } else if constexpr (is_u64x2 and x86_64_v2) {
2045 return numeric_array{u64x2_x64v2_swizzle<Elements...>(v)};
2046 }
2047 }
2048
2049 auto r = numeric_array{};
2050 swizzle_detail<0, Elements...>(r);
2051 return r;
2052 }
2053
// SWIZZLE defines a const member function called `swizzle_name` that is only
// available when the array has D elements and that forwards the given
// selectors to swizzle<>().
2054#define SWIZZLE(swizzle_name, D, ...) \
2055 [[nodiscard]] constexpr numeric_array swizzle_name() const noexcept requires(D == N) \
2056 { \
2057 return swizzle<__VA_ARGS__>(); \
2058 }
2059
// SWIZZLE_4D_GEN1 appends the last letter/selector of a 4 element swizzle:
// one of 0, 1, x, y, z, w mapped to get_zero, get_one, 0, 1, 2, 3.
2060#define SWIZZLE_4D_GEN1(name, ...) \
2061 SWIZZLE(name##0, 4, __VA_ARGS__, get_zero) \
2062 SWIZZLE(name##1, 4, __VA_ARGS__, get_one) \
2063 SWIZZLE(name##x, 4, __VA_ARGS__, 0) \
2064 SWIZZLE(name##y, 4, __VA_ARGS__, 1) \
2065 SWIZZLE(name##z, 4, __VA_ARGS__, 2) \
2066 SWIZZLE(name##w, 4, __VA_ARGS__, 3)
2067
// SWIZZLE_4D_GEN2 appends the third letter/selector and recurses into GEN1.
2068#define SWIZZLE_4D_GEN2(name, ...) \
2069 SWIZZLE_4D_GEN1(name##0, __VA_ARGS__, get_zero) \
2070 SWIZZLE_4D_GEN1(name##1, __VA_ARGS__, get_one) \
2071 SWIZZLE_4D_GEN1(name##x, __VA_ARGS__, 0) \
2072 SWIZZLE_4D_GEN1(name##y, __VA_ARGS__, 1) \
2073 SWIZZLE_4D_GEN1(name##z, __VA_ARGS__, 2) \
2074 SWIZZLE_4D_GEN1(name##w, __VA_ARGS__, 3)
2075
// SWIZZLE_4D_GEN3 appends the second letter/selector and recurses into GEN2.
2076#define SWIZZLE_4D_GEN3(name, ...) \
2077 SWIZZLE_4D_GEN2(name##0, __VA_ARGS__, get_zero) \
2078 SWIZZLE_4D_GEN2(name##1, __VA_ARGS__, get_one) \
2079 SWIZZLE_4D_GEN2(name##x, __VA_ARGS__, 0) \
2080 SWIZZLE_4D_GEN2(name##y, __VA_ARGS__, 1) \
2081 SWIZZLE_4D_GEN2(name##z, __VA_ARGS__, 2) \
2082 SWIZZLE_4D_GEN2(name##w, __VA_ARGS__, 3)
2083
// Generate every four letter swizzle; names starting with a digit get a `_` prefix.
2084 SWIZZLE_4D_GEN3(_0, get_zero)
2085 SWIZZLE_4D_GEN3(_1, get_one)
2086 SWIZZLE_4D_GEN3(x, 0)
2087 SWIZZLE_4D_GEN3(y, 1)
2088 SWIZZLE_4D_GEN3(z, 2)
2089 SWIZZLE_4D_GEN3(w, 3)
2090
// Same scheme for 3 element arrays: selectors 0, 1, x, y, z.
2091#define SWIZZLE_3D_GEN1(name, ...) \
2092 SWIZZLE(name##0, 3, __VA_ARGS__, get_zero) \
2093 SWIZZLE(name##1, 3, __VA_ARGS__, get_one) \
2094 SWIZZLE(name##x, 3, __VA_ARGS__, 0) \
2095 SWIZZLE(name##y, 3, __VA_ARGS__, 1) \
2096 SWIZZLE(name##z, 3, __VA_ARGS__, 2)
2097
2098#define SWIZZLE_3D_GEN2(name, ...) \
2099 SWIZZLE_3D_GEN1(name##0, __VA_ARGS__, get_zero) \
2100 SWIZZLE_3D_GEN1(name##1, __VA_ARGS__, get_one) \
2101 SWIZZLE_3D_GEN1(name##x, __VA_ARGS__, 0) \
2102 SWIZZLE_3D_GEN1(name##y, __VA_ARGS__, 1) \
2103 SWIZZLE_3D_GEN1(name##z, __VA_ARGS__, 2)
2104
// Generate every three letter swizzle.
2105 SWIZZLE_3D_GEN2(_0, get_zero)
2106 SWIZZLE_3D_GEN2(_1, get_one)
2107 SWIZZLE_3D_GEN2(x, 0)
2108 SWIZZLE_3D_GEN2(y, 1)
2109 SWIZZLE_3D_GEN2(z, 2)
2110
// Same scheme for 2 element arrays: selectors 0, 1, x, y.
2111#define SWIZZLE_2D_GEN1(name, ...) \
2112 SWIZZLE(name##0, 2, __VA_ARGS__, get_zero) \
2113 SWIZZLE(name##1, 2, __VA_ARGS__, get_one) \
2114 SWIZZLE(name##x, 2, __VA_ARGS__, 0) \
2115 SWIZZLE(name##y, 2, __VA_ARGS__, 1)
2116
// Generate every two letter swizzle.
2117 SWIZZLE_2D_GEN1(_0, get_zero)
2118 SWIZZLE_2D_GEN1(_1, get_one)
2119 SWIZZLE_2D_GEN1(x, 0)
2120 SWIZZLE_2D_GEN1(y, 1)
2121
// The generator macros are implementation details; remove them again.
2122#undef SWIZZLE
2123#undef SWIZZLE_4D_GEN1
2124#undef SWIZZLE_4D_GEN2
2125#undef SWIZZLE_4D_GEN3
2126#undef SWIZZLE_3D_GEN1
2127#undef SWIZZLE_3D_GEN2
2128#undef SWIZZLE_2D_GEN1
2129
2130
2131 template<int I, typename First, typename... Rest>
2132 friend constexpr void transpose_detail(First const &first, Rest const &...rest, std::array<numeric_array, N> &r) noexcept
2133 {
2134 for (size_t j = 0; j != N; ++j) {
2135 r[j][I] = first[j];
2136 }
2137
2138 if constexpr (sizeof...(Rest) != 0) {
2139 transpose_detail<I + 1, Rest...>(rest..., r);
2140 }
2141 }
2142
2143 template<ssize_t I, ssize_t FirstElement, ssize_t... RestElements>
2144 constexpr void swizzle_detail(numeric_array &r) const noexcept
2145 {
2146 static_assert(I < narrow_cast<ssize_t>(N));
2147 static_assert(FirstElement >= -2 && FirstElement < narrow_cast<ssize_t>(N), "Index out of bounds");
2148
2149 get<I>(r) = get<FirstElement>(*this);
2150 if constexpr (sizeof...(RestElements) != 0) {
2151 swizzle_detail<I + 1, RestElements...>(r);
2152 }
2153 }
2154};
2155
// Convenience aliases named <element type><bits>x<element count>:
// i = signed integer, u = unsigned integer, f = floating point.

// Arrays of int8_t.
2156using i8x1 = numeric_array<int8_t, 1>;
2157using i8x2 = numeric_array<int8_t, 2>;
2158using i8x4 = numeric_array<int8_t, 4>;
2159using i8x8 = numeric_array<int8_t, 8>;
2160using i8x16 = numeric_array<int8_t, 16>;
2161using i8x32 = numeric_array<int8_t, 32>;
2162using i8x64 = numeric_array<int8_t, 64>;
2163
// Arrays of uint8_t.
2164using u8x1 = numeric_array<uint8_t, 1>;
2165using u8x2 = numeric_array<uint8_t, 2>;
2166using u8x4 = numeric_array<uint8_t, 4>;
2167using u8x8 = numeric_array<uint8_t, 8>;
2168using u8x16 = numeric_array<uint8_t, 16>;
2169using u8x32 = numeric_array<uint8_t, 32>;
2170using u8x64 = numeric_array<uint8_t, 64>;
2171
// Arrays of int16_t.
2172using i16x1 = numeric_array<int16_t, 1>;
2173using i16x2 = numeric_array<int16_t, 2>;
2174using i16x4 = numeric_array<int16_t, 4>;
2175using i16x8 = numeric_array<int16_t, 8>;
2176using i16x16 = numeric_array<int16_t, 16>;
2177using i16x32 = numeric_array<int16_t, 32>;
2178
// Arrays of uint16_t.
2179using u16x1 = numeric_array<uint16_t, 1>;
2180using u16x2 = numeric_array<uint16_t, 2>;
2181using u16x4 = numeric_array<uint16_t, 4>;
2182using u16x8 = numeric_array<uint16_t, 8>;
2183using u16x16 = numeric_array<uint16_t, 16>;
2184using u16x32 = numeric_array<uint16_t, 32>;
2185
// Arrays of int32_t.
2186using i32x1 = numeric_array<int32_t, 1>;
2187using i32x2 = numeric_array<int32_t, 2>;
2188using i32x4 = numeric_array<int32_t, 4>;
2189using i32x8 = numeric_array<int32_t, 8>;
2190using i32x16 = numeric_array<int32_t, 16>;
2191
// Arrays of uint32_t.
2192using u32x1 = numeric_array<uint32_t, 1>;
2193using u32x2 = numeric_array<uint32_t, 2>;
2194using u32x4 = numeric_array<uint32_t, 4>;
2195using u32x8 = numeric_array<uint32_t, 8>;
2196using u32x16 = numeric_array<uint32_t, 16>;
2197
// Arrays of float.
2198using f32x1 = numeric_array<float, 1>;
2199using f32x2 = numeric_array<float, 2>;
2200using f32x4 = numeric_array<float, 4>;
2201using f32x8 = numeric_array<float, 8>;
2202using f32x16 = numeric_array<float, 16>;
2203
// Arrays of int64_t.
2204using i64x1 = numeric_array<int64_t, 1>;
2205using i64x2 = numeric_array<int64_t, 2>;
2206using i64x4 = numeric_array<int64_t, 4>;
2207using i64x8 = numeric_array<int64_t, 8>;
2208
// Arrays of uint64_t.
2209using u64x1 = numeric_array<uint64_t, 1>;
2210using u64x2 = numeric_array<uint64_t, 2>;
2211using u64x4 = numeric_array<uint64_t, 4>;
2212using u64x8 = numeric_array<uint64_t, 8>;
2213
// Arrays of double.
2214using f64x1 = numeric_array<double, 1>;
2215using f64x2 = numeric_array<double, 2>;
2216using f64x4 = numeric_array<double, 4>;
2217using f64x8 = numeric_array<double, 8>;
2218
2219} // namespace tt
2220
2221namespace std {
2222template<class T, std::size_t N>
2223struct tuple_size<tt::numeric_array<T, N>> : std::integral_constant<std::size_t, N> {
2224};
2225
2226template<std::size_t I, class T, std::size_t N>
2227struct tuple_element<I, tt::numeric_array<T, N>> {
2228 using type = T;
2229};
2230
2231} // namespace std
STL namespace.
Definition numeric_array.hpp:44
friend constexpr T get(numeric_array &&rhs) noexcept
Get an element from the numeric array.
Definition numeric_array.hpp:910
static constexpr numeric_array load(std::byte const *ptr) noexcept
Load a numeric array from memory.
Definition numeric_array.hpp:538
friend constexpr numeric_array neg(numeric_array rhs) noexcept
Negate individual elements.
Definition numeric_array.hpp:971
friend constexpr T get(numeric_array const &rhs) noexcept
Get an element from the numeric array.
Definition numeric_array.hpp:929
friend constexpr T squared_hypot(numeric_array const &rhs) noexcept
Take the squared length of the vector.
Definition numeric_array.hpp:1150
friend constexpr numeric_array cross_2D(numeric_array const &rhs) noexcept
Calculate the 2D normal on a 2D vector.
Definition numeric_array.hpp:1744
friend constexpr numeric_array reflect_point(numeric_array const &p, numeric_array const anchor) noexcept
Find the point on the other side and at the same distance of an anchor-point.
Definition numeric_array.hpp:1927
constexpr void store(std::byte *ptr) const noexcept
Store a numeric array into memory.
Definition numeric_array.hpp:576
friend constexpr numeric_array midpoint(numeric_array const &p1, numeric_array const &p2) noexcept
Find a point at the midpoint between two points.
Definition numeric_array.hpp:1918
friend constexpr numeric_array shift_right(numeric_array const &lhs, unsigned int rhs) noexcept
Shift the elements right.
Definition numeric_array.hpp:1830
friend constexpr T & get(numeric_array &rhs) noexcept
Get an element from the numeric array.
Definition numeric_array.hpp:898
static constexpr numeric_array interleave_lo(numeric_array a, numeric_array b) noexcept
Interleave the first words in both arrays.
Definition numeric_array.hpp:508
friend constexpr T dot(numeric_array const &lhs, numeric_array const &rhs) noexcept
Take a dot product.
Definition numeric_array.hpp:1109
friend constexpr T rcp_hypot(numeric_array const &rhs) noexcept
Take a reciprocal of the length.
Definition numeric_array.hpp:1162
friend constexpr numeric_array shuffle(numeric_array const &lhs, numeric_array const &rhs)
Shuffle a 16x byte array, using the indices from the right-hand-side.
Definition numeric_array.hpp:1895
friend constexpr T hypot(numeric_array const &rhs) noexcept
Take the length of the vector.
Definition numeric_array.hpp:1134
friend constexpr float cross_2D(numeric_array const &lhs, numeric_array const &rhs) noexcept
Calculate the cross-product between two 2D vectors.
Definition numeric_array.hpp:1759
friend constexpr numeric_array normal_2D(numeric_array const &rhs) noexcept
Calculate the 2D unit-normal on a 2D vector.
Definition numeric_array.hpp:1752
constexpr friend numeric_array insert(numeric_array const &lhs, numeric_array const &rhs)
Insert an element from rhs into the result.
Definition numeric_array.hpp:2000
constexpr numeric_array swizzle() const
Swizzle around the elements of the numeric array.
Definition numeric_array.hpp:2033
friend constexpr numeric_array addsub(numeric_array const &lhs, numeric_array const &rhs) noexcept
Add or subtract individual elements.
Definition numeric_array.hpp:1623
static constexpr numeric_array load(std::byte const *ptr) noexcept
Load a numeric array from memory.
Definition numeric_array.hpp:549
friend constexpr numeric_array normalize(numeric_array const &rhs) noexcept
Normalize a vector.
Definition numeric_array.hpp:1180
friend constexpr numeric_array shift_left(numeric_array const &lhs, unsigned int rhs) noexcept
Shift the elements left.
Definition numeric_array.hpp:1814
static constexpr numeric_array load(T const *ptr) noexcept
Load a numeric array from memory.
Definition numeric_array.hpp:560
friend constexpr numeric_array zero(numeric_array rhs) noexcept
Set individual elements to zero.
Definition numeric_array.hpp:947
Definition concepts.hpp:31
Definition concepts.hpp:34
Definition concepts.hpp:37
T back(T... args)
T begin(T... args)
T ceil(T... args)
T data(T... args)
T empty(T... args)
T end(T... args)
T floor(T... args)
T front(T... args)
T max_size(T... args)
T memcpy(T... args)
T round(T... args)
T size(T... args)
T sqrt(T... args)