HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
float_to_half.hpp
1
2
3
4#pragma once
5
6#include "macros.hpp"
7#if defined(HI_HAS_X86)
8#include "cpu_id_x86.hpp"
9#else
10#include "cpu_id_generic.hpp"
11#endif
12#include <cstdint>
13#include <bit>
14#include <type_traits>
15#include <array>
16
17#ifdef HI_HAS_X86
18#include <immintrin.h>
19#include <emmintrin.h>
20#include <smmintrin.h>
21#endif
22
23hi_export_module(hikocpu : float_to_half);
24
25hi_export namespace hi { inline namespace v1 {
26
27[[nodiscard]] constexpr static uint16_t float_to_half_generic(float a) noexcept
28{
29 auto u32 = std::bit_cast<uint32_t>(a);
30
31 // Extract exponent.
32 auto exponent = static_cast<int16_t>(static_cast<uint8_t>((u32 << 1) >> 24)) - 127 + 15;
33
34 // Extract the 24-bit mantissa.
35 auto mantissa = (u32 << 9) >> (9 + 24 - 11);
36
37 auto const is_inf = exponent >= 0x1f;
38 if (is_inf) {
39 exponent = 0x1f;
40 mantissa = 0;
41 }
42
43 // Handle NaN.
44 auto const is_nan = (u32 << 1) > 0xff000000;
45 if (is_nan) {
46 mantissa = 1;
47 }
48
49 // Add implicit leading bit.
50 mantissa |= 0x0400;
51
52 // Shift mantissa when denormalizing.
53 auto shift = 1 - exponent;
54 if (shift < 0) {
55 shift = 0;
56 }
57 if (shift > 31) {
58 shift = 31;
59 }
60 mantissa >>= shift;
61
62 // Adjust exponent for denormals and zero.
63 if (exponent < 0) {
64 exponent = 0;
65 }
66
67 // Remove implicit leading bit.
68 mantissa &= 0x03ff;
69
70 // Extract sign.
71 auto r = static_cast<uint16_t>((static_cast<int32_t>(u32) >> 31) << 15);
72 r |= exponent << 10;
73 r |= mantissa;
74 return r;
75}
76
77[[nodiscard]] constexpr std::array<uint16_t,4> float_to_half_generic(std::array<float,4> a) noexcept
78{
79 auto r = std::array<uint16_t,4>{};
80 for (size_t i = 0; i != 4; ++i) {
81 r[i] = float_to_half_generic(a[i]);
82 }
83 return r;
84}
85
86#if HI_HAS_X86
87hi_target("sse,sse2,f16c")
88[[nodiscard]] inline std::array<uint16_t,4> float_to_half_f16c(std::array<float,4> a) noexcept
89{
90 auto const a_ = _mm_loadu_ps(a.data());
91 auto const r = _mm_cvtps_ph(a_, _MM_FROUND_TO_ZERO);
92 return std::bit_cast<std::array<uint16_t,4>>(_mm_cvtsi128_si64(r));
93}
94
95hi_target("sse,sse2,f16c")
96[[nodiscard]] hi_no_inline inline uint16_t float_to_half_f16c(float a) noexcept
97{
98 auto a_ = std::array<float, 4>{};
99 std::get<0>(a_) = a;
100 auto const r = float_to_half_f16c(a_);
101 return std::get<0>(r);
102}
103#endif
104
105
106#if HI_HAS_X86
107hi_target("sse,sse2")
108[[nodiscard]] inline std::array<uint16_t,4> float_to_half_sse2(std::array<float,4> a) noexcept
109{
110 auto const unknown_value = _mm_undefined_si128();
111 auto const ffffffff = _mm_cmpeq_epi32(unknown_value, unknown_value);
112
113 auto r = _mm_castps_si128(_mm_loadu_ps(a.data()));
114
115 // Extract the sign into bit 15.
116 auto const sign = _mm_slli_epi32(_mm_srli_epi32(r, 31), 15);
117
118 // Strip off the sign.
119 r = _mm_srli_epi32(_mm_slli_epi32(r, 1), 1);
120
121 auto const infinite_f32 = _mm_slli_epi32(_mm_srli_epi32(ffffffff, 24), 23);
122 auto const is_nan = _mm_cmpgt_epi32(r, infinite_f32);
123 if (_mm_movemask_epi8(is_nan) != 0) {
124 return float_to_half_generic(a);
125 }
126
127 auto const is_zero = _mm_cmpeq_epi32(r, _mm_setzero_si128());
128
129 // Subtract 112 from the exponent.
130 auto const exponent_adjust = _mm_slli_epi32(_mm_srli_epi32(ffffffff, 29), 27);
131 r = _mm_sub_epi32(r, exponent_adjust);
132
133 // If after adjustment the exponent is zero or less, then it is a denormal.
134 auto const max_denormal = _mm_srli_epi32(ffffffff, 9);
135 auto const is_denorm = _mm_andnot_si128(is_zero, _mm_cmpgt_epi32(max_denormal, r));
136 if (_mm_movemask_epi8(is_denorm) != 0) {
137 return float_to_half_generic(a);
138 }
139
140 // Make sure the value is zero if the original was zero.
141 r = _mm_andnot_si128(is_zero, r);
142
143 // If after adjustment the exponent is greater or equal to 0x1f then the value is infinite.
144 // Then make the value not go over infinite.
145 auto const infinite_f16_in_f32 = _mm_slli_epi32(_mm_srli_epi32(ffffffff, 27), 23);
146 auto const is_inf = _mm_cmpgt_epi32(r, infinite_f16_in_f32);
147 r = _mm_andnot_si128(is_inf, r);
148 r = _mm_or_si128(r, _mm_and_si128(is_inf, infinite_f16_in_f32));
149
150 // Shift to fit inside 16-bits.
151 r = _mm_srli_epi32(r, 13);
152
153 // Add the sign back in.
154 r = _mm_or_si128(r, sign);
155
156 // Pack 16 bit values in lower half.
157 r = _mm_shufflelo_epi16(r, 0b11'11'10'00);
158 r = _mm_shufflehi_epi16(r, 0b11'11'10'00);
159 r = _mm_shuffle_epi32(r, 0b11'11'10'00);
160 return std::bit_cast<std::array<uint16_t,4>>(_mm_cvtsi128_si64(r));
161}
162
163hi_target("sse,sse2")
164[[nodiscard]] hi_no_inline inline uint16_t float_to_half_sse2(float a) noexcept
165{
166 auto a_ = std::array<float, 4>{};
167 std::get<0>(a_) = a;
168 auto const r = float_to_half_sse2(a_);
169 return std::get<0>(r);
170}
171#endif
172
173[[nodiscard]] constexpr uint16_t float_to_half(float v) noexcept
174{
175 if (not std::is_constant_evaluated()) {
176#if HI_HAS_X86
177 if (has_f16c()) {
178 auto v_ = std::array<float,4>{v, v, v, v};
179 auto tmp = float_to_half_f16c(v_);
180 return std::get<0>(tmp);
181 }
182#endif
183 }
184
185 return float_to_half_generic(v);
186}
187
188[[nodiscard]] constexpr std::array<uint16_t, 4> float_to_half(std::array<float, 4> v) noexcept
189{
190 auto r = std::array<uint16_t, 4>{};
191
192 if (not std::is_constant_evaluated()) {
193#if HI_HAS_X86
194 if (has_f16c()) {
195 return float_to_half_f16c(v);
196 }
197 if (has_sse2()) {
198 return float_to_half_sse2(v);
199 }
200#endif
201 }
202
203 for (size_t i = 0; i != 4; ++i) {
204 r[i] = float_to_half_generic(v[i]);
205 }
206 return r;
207}
208
209}}
210
The HikoGUI namespace.
Definition array_generic.hpp:20
bool has_sse2() noexcept
This CPU has the SSE2 instruction set.
Definition cpu_id_x86.hpp:672
bool has_f16c() noexcept
This CPU has float-16 conversion instructions.
Definition cpu_id_x86.hpp:752
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
@ shift
The shift key is being held.