HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
native_i8x16_sse2.hpp
// Copyright Take Vos 2022, 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "native_simd_utility.hpp"
8#include "../utility/module.hpp"
9#include <array>
10#include <ostream>
11
12hi_warning_push();
// C26472: Don't use a static_cast for arithmetic conversions.
// This is a low level type.
15hi_warning_ignore_msvc(26472);
16
17namespace hi { inline namespace v1 {
18
19#ifdef HI_HAS_SSE2
20
36struct native_i8x16 {
37 using value_type = int8_t;
38 constexpr static size_t size = 4;
39 using register_type = __m128i;
40 using array_type = std::array<value_type, size>;
41
42 register_type v;
43
44 native_i8x16(native_i8x16 const&) noexcept = default;
45 native_i8x16(native_i8x16&&) noexcept = default;
46 native_i8x16& operator=(native_i8x16 const&) noexcept = default;
47 native_i8x16& operator=(native_i8x16&&) noexcept = default;
48
51 native_i8x16() noexcept : v(_mm_setzero_si128()) {}
52
53 [[nodiscard]] explicit native_i8x16(register_type other) noexcept : v(other) {}
54
55 [[nodiscard]] explicit operator register_type() const noexcept
56 {
57 return v;
58 }
59
79 [[nodiscard]] native_i8x16(
80 value_type a,
81 value_type b = value_type{0},
82 value_type c = value_type{0},
83 value_type d = value_type{0},
84 value_type e = value_type{0},
85 value_type f = value_type{0},
86 value_type g = value_type{0},
87 value_type h = value_type{0},
88 value_type i = value_type{0},
89 value_type j = value_type{0},
90 value_type k = value_type{0},
91 value_type l = value_type{0},
92 value_type m = value_type{0},
93 value_type n = value_type{0},
94 value_type o = value_type{0},
95 value_type p = value_type{0}
96 ) noexcept :
97 v(_mm_set_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a))
98 {
99 }
100
101 [[nodiscard]] explicit native_i8x16(value_type const *other) noexcept :
102 v(_mm_loadu_si128(reinterpret_cast<register_type const *>(other)))
103 {
104 }
105
106 void store(value_type *out) const noexcept
107 {
109 _mm_storeu_si128(reinterpret_cast<register_type *>(out), v);
110 }
111
112 [[nodiscard]] explicit native_i8x16(void const *other) noexcept : v(_mm_loadu_si128(static_cast<register_type const *>(other)))
113 {
114 }
115
116 void store(void *out) const noexcept
117 {
119 _mm_storeu_si128(static_cast<register_type *>(out), v);
120 }
121
122 [[nodiscard]] explicit native_i8x16(std::span<value_type const> other) noexcept
123 {
124 hi_axiom(other.size() >= size);
125 v = _mm_loadu_si128(reinterpret_cast<register_type const *>(other.data()));
126 }
127
128 void store(std::span<value_type> out) const noexcept
129 {
130 hi_axiom(out.size() >= size);
131 _mm_storeu_si128(reinterpret_cast<register_type *>(out.data()), v);
132 }
133
134 [[nodiscard]] explicit native_i8x16(array_type other) noexcept :
135 v(_mm_loadu_si128(reinterpret_cast<register_type const *>(other.data())))
136 {
137 }
138
139 [[nodiscard]] explicit operator array_type() const noexcept
140 {
141 auto r = array_type{};
142 _mm_storeu_si128(reinterpret_cast<register_type *>(r.data()), v);
143 return r;
144 }
145
146#ifdef AVX512F
147 [[nodiscard]] explicit native_i8x16(native_f32x16 const& a) noexcept;
148 [[nodiscard]] explicit native_i8x16(native_u32x16 const& a) noexcept;
149#endif
150
160 [[nodiscard]] static native_i8x16 broadcast(value_type a) noexcept
161 {
162 return native_i8x16{_mm_set1_epi8(a)};
163 }
164
186 [[nodiscard]] static native_i8x16 broadcast(native_i8x16 a) noexcept
187 {
188#ifdef HI_HAS_AVX2
189 return native_i8x16{_mm_broadcastb_epi8(a.v)};
190#elif HI_HAS_SSSE3
191 return native_i8x16{_mm_shuffle_epi8(a.v, _mm_setzero_si128())};
192#else
193 // Create a mask for 1 byte each 32 bit word, AND it with a.v.
194 auto tmp = _mm_undefined_si128();
195 tmp = _mm_cmpeq_epi32(tmp, tmp);
196 tmp = _mm_slli_epi32(tmp, 24);
197 tmp = _mm_and_si128(tmp, a.v);
198
199 // Broadcast the first byte to all the bytes in the first 32 bit word.
200 tmp = _mm_or_si128(tmp, _mm_slli_epi32(tmp, 8));
201 tmp = _mm_or_si128(tmp, _mm_slli_epi32(tmp,16));
202
203 // Broadcast the first 32 bit word to all 4 32 bit words.
204 tmp = _mm_shuffle_epi32(tmp, 0b00'00'00'00);
205 return native_i8x16{tmp};
206#endif
207 }
208
209 [[nodiscard]] static native_i8x16 ones() noexcept
210 {
211 hilet tmp = _mm_undefined_si128();
212 return native_i8x16{_mm_cmpeq_epi32(tmp, tmp)};
213 }
214
217 [[nodiscard]] size_t mask() const noexcept
218 {
219 return narrow_cast<size_t>(_mm_movemask_epi8(v));
220 }
221
222 [[nodiscard]] friend bool equal(native_i8x16 a, native_i8x16 b) noexcept
223 {
224 return (a == b).mask() == 0b1111'1111'1111'1111;
225 }
226
227 [[nodiscard]] friend native_i8x16 operator==(native_i8x16 a, native_i8x16 b) noexcept
228 {
229 return native_i8x16{_mm_cmpeq_epi8(a.v, b.v)};
230 }
231
232 [[nodiscard]] friend native_i8x16 operator!=(native_i8x16 a, native_i8x16 b) noexcept
233 {
234 return ~(a == b);
235 }
236
237 [[nodiscard]] friend native_i8x16 operator<(native_i8x16 a, native_i8x16 b) noexcept
238 {
239 return native_i8x16{_mm_cmplt_epi8(a.v, b.v)};
240 }
241
242 [[nodiscard]] friend native_i8x16 operator>(native_i8x16 a, native_i8x16 b) noexcept
243 {
244 return native_i8x16{_mm_cmpgt_epi8(a.v, b.v)};
245 }
246
247 [[nodiscard]] friend native_i8x16 operator<=(native_i8x16 a, native_i8x16 b) noexcept
248 {
249 return ~(a > b);
250 }
251
252 [[nodiscard]] friend native_i8x16 operator>=(native_i8x16 a, native_i8x16 b) noexcept
253 {
254 return ~(a < b);
255 }
256
257 [[nodiscard]] friend native_i8x16 operator+(native_i8x16 a) noexcept
258 {
259 return a;
260 }
261
262 [[nodiscard]] friend native_i8x16 operator-(native_i8x16 a) noexcept
263 {
264 return native_i8x16{} - a;
265 }
266
267 [[nodiscard]] friend native_i8x16 operator+(native_i8x16 a, native_i8x16 b) noexcept
268 {
269 return native_i8x16{_mm_add_epi8(a.v, b.v)};
270 }
271
272 [[nodiscard]] friend native_i8x16 operator-(native_i8x16 a, native_i8x16 b) noexcept
273 {
274 return native_i8x16{_mm_sub_epi8(a.v, b.v)};
275 }
276
277 [[nodiscard]] friend native_i8x16 operator&(native_i8x16 a, native_i8x16 b) noexcept
278 {
279 return native_i8x16{_mm_and_si128(a.v, b.v)};
280 }
281
282 [[nodiscard]] friend native_i8x16 operator|(native_i8x16 a, native_i8x16 b) noexcept
283 {
284 return native_i8x16{_mm_or_si128(a.v, b.v)};
285 }
286
287 [[nodiscard]] friend native_i8x16 operator^(native_i8x16 a, native_i8x16 b) noexcept
288 {
289 return native_i8x16{_mm_xor_si128(a.v, b.v)};
290 }
291
292 [[nodiscard]] friend native_i8x16 operator~(native_i8x16 a) noexcept
293 {
294 auto ones = _mm_undefined_si128();
295 ones = _mm_cmpeq_epi32(ones, ones);
296 return native_i8x16{_mm_andnot_si128(a.v, ones)};
297 }
298
299 [[nodiscard]] friend native_i8x16 min(native_i8x16 a, native_i8x16 b) noexcept
300 {
301#if HI_HAS_SSE4_1
302 return native_i8x16{_mm_min_epi8(a.v, b.v)};
303#else
304 hilet mask = a < b;
305 return (mask & a) | not_and(mask, b);
306#endif
307 }
308
309 [[nodiscard]] friend native_i8x16 max(native_i8x16 a, native_i8x16 b) noexcept
310 {
311#if HI_HAS_SSE4_1
312 return native_i8x16{_mm_max_epi8(a.v, b.v)};
313#else
314 hilet mask = a > b;
315 return (mask & a) | not_and(mask, b);
316#endif
317 }
318
319 [[nodiscard]] friend native_i8x16 abs(native_i8x16 a) noexcept
320 {
321#if HI_HAS_SSSE3
322 return native_i8x16{_mm_abs_epi8(a.v)};
323#else
324 hilet mask = a > native_i8x16{};
325 return (mask & a) | not_and(mask, -a);
326#endif
327 }
328
335 template<size_t Mask>
336 [[nodiscard]] friend native_i8x16 set_zero(native_i8x16 a) noexcept
337 {
338 static_assert(Mask <= 0b1111);
339#ifdef HI_HAS_SSE4_1
340 return native_i8x16{_mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(a.v), _mm_castsi128_ps(a.v), Mask))};
341#else
342 hilet mask = from_mask<Mask>();
343 return not_and(mask, a);
344#endif
345 }
346
354 template<size_t Index>
355 [[nodiscard]] friend native_i8x16 insert(native_i8x16 a, value_type b) noexcept
356 {
357 static_assert(Index < 4);
358
359#ifdef HI_HAS_SSE4_1
360 return native_i8x16{_mm_insert_epi8(a.v, b, Index)};
361#else
362 hilet mask = from_mask<1_uz << Index>();
363 return not_and(mask, a) | (mask & broadcast(b));
364#endif
365 }
366
373 template<size_t Index>
374 [[nodiscard]] friend value_type get(native_i8x16 a) noexcept
375 {
376#ifdef HI_HAS_SSE4_1
377 return static_cast<value_type>(_mm_extract_epi8(a.v, Index));
378#else
379 auto r = static_cast<array_type>(a);
380 return std::get<Index>(r);
381#endif
382 }
383
389 [[nodiscard]] friend native_i8x16 not_and(native_i8x16 a, native_i8x16 b) noexcept
390 {
391 return native_i8x16{_mm_andnot_si128(a.v, b.v)};
392 }
393
394 friend std::ostream& operator<<(std::ostream& a, native_i8x16 b) noexcept
395 {
396 return a << "(" << get<0>(b) << ", " << get<1>(b) << ", " << get<2>(b) << ", " << get<3>(b) << ")";
397 }
398};
399
400#endif
401
402}} // namespace hi::v1
403
404hi_warning_pop();
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hi_axiom_not_null(expression,...)
Assert if an expression is not nullptr.
Definition assert.hpp:272
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
@ other
The gui_event does not have associated data.
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11
T equal(T... args)
T max(T... args)
T min(T... args)
T operator!=(T... args)