HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
array_intrinsic_f16x4_x86.hpp
1// Copyright Take Vos 2023.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "array_intrinsic.hpp"
8#include "float_to_half.hpp"
9#include "half.hpp"
10#include "macros.hpp"
11#include <cstddef>
12#include <array>
13#include <limits>
14
15#include <xmmintrin.h>
16#include <emmintrin.h>
17#include <pmmintrin.h>
18#include <tmmintrin.h>
19#include <smmintrin.h>
20#include <nmmintrin.h>
21#include <immintrin.h>
22
23hi_export_module(hikocpu : array_intrinsic_f16x4);
24
25hi_export namespace hi {
26inline namespace v1 {
27
28#if defined(HI_HAS_SSE2)
29template<>
30struct array_intrinsic<half, 4> {
31 using value_type = half;
32 using register_type = __m128i;
33 using array_type = std::array<half, 4>;
34
37 [[nodiscard]] hi_force_inline static register_type L(array_type a) noexcept
38 {
39 return _mm_set_epi64x(0, std::bit_cast<int64_t>(a));
40 }
41
44 [[nodiscard]] hi_force_inline static array_type S(register_type a) noexcept
45 {
46 return std::bit_cast<array_type>(_mm_cvtsi128_si64(a));
47 }
48
49
50 [[nodiscard]] hi_force_inline static array_type convert(std::array<float, 4> a) noexcept
51 {
52 if (has_f16c()) {
53 return std::bit_cast<array_type>(float_to_half_f16c(a));
54 } else {
55 return std::bit_cast<array_type>(float_to_half_sse2(a));
56 }
57 }
58
59 [[nodiscard]] hi_force_inline static array_type undefined() noexcept
60 {
61 return S(_mm_undefined_si128());
62 }
63
64 [[nodiscard]] hi_force_inline static array_type set_zero() noexcept
65 {
66 return S(_mm_setzero_si128());
67 }
68
69 [[nodiscard]] hi_force_inline static array_type set_all_ones() noexcept
70 {
71 return S(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
72 }
73
74 [[nodiscard]] hi_force_inline static array_type set_one() noexcept
75 {
76 auto const ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
77 return S(_mm_srli_epi16(_mm_slli_epi16(ones, 12), 2));
78 }
79
80 [[nodiscard]] hi_force_inline static array_type inv(array_type a) noexcept
81 {
82 return _xor(set_all_ones(), a);
83 }
84
85 [[nodiscard]] hi_force_inline static bool test(array_type a, array_type b) noexcept
86 {
87#if defined(HI_HAS_SSE4_1)
88 return static_cast<bool>(_mm_testz_si128(L(a), L(b)));
89#else
90 return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_and_si128(L(a), L(b)), _mm_setzero_si128())) == 0xffff;
91#endif
92 }
93
94 [[nodiscard]] hi_force_inline static array_type _or(array_type a, array_type b) noexcept
95 {
96 return S(_mm_or_si128(L(a), L(b)));
97 }
98
99 [[nodiscard]] hi_force_inline static array_type _and(array_type a, array_type b) noexcept
100 {
101 return S(_mm_and_si128(L(a), L(b)));
102 }
103
104 [[nodiscard]] hi_force_inline static array_type _xor(array_type a, array_type b) noexcept
105 {
106 return S(_mm_xor_si128(L(a), L(b)));
107 }
108
109 [[nodiscard]] hi_force_inline static array_type andnot(array_type a, array_type b) noexcept
110 {
111 return S(_mm_andnot_si128(L(a), L(b)));
112 }
113
114 [[nodiscard]] hi_force_inline static array_type sll(array_type a, unsigned int b) noexcept
115 {
116 auto const b_ = _mm_set_epi32(0, 0, 0, b);
117 return S(_mm_sll_epi16(L(a), b_));
118 }
119
120 [[nodiscard]] hi_force_inline static array_type srl(array_type a, unsigned int b) noexcept
121 {
122 auto const b_ = _mm_set_epi32(0, 0, 0, b);
123 return S(_mm_srl_epi16(L(a), b_));
124 }
125
126 [[nodiscard]] hi_force_inline static array_type sra(array_type a, unsigned int b) noexcept
127 {
128 auto const b_ = _mm_set_epi32(0, 0, 0, b);
129 return S(_mm_sra_epi16(L(a), b_));
130 }
131
132#if defined(HI_HAS_SSE4_1)
133 template<size_t Mask>
134 [[nodiscard]] hi_force_inline static array_type blend(array_type a, array_type b) noexcept
135 {
136 return S(_mm_blend_epi16(L(a), L(b), Mask));
137 }
138#endif
139};
140#endif
141
142} // namespace v1
143} // namespace hi
The HikoGUI namespace.
Definition array_generic.hpp:20
bool has_f16c() noexcept
This CPU has float-16 conversion instructions.
Definition cpu_id_x86.hpp:752
DOXYGEN BUG.
Definition algorithm_misc.hpp:20