8#include "cpu_id_x86.hpp"
10#include "cpu_id_generic.hpp"
23hi_export_module(hikocpu : float_to_half);
25hi_export
namespace hi {
inline namespace v1 {
// Scalar float -> half-float conversion done with integer operations on the
// float's bit pattern. Usable in constant expressions (declared constexpr).
// NOTE(review): only part of the body is visible in this listing; how the
// values below are combined into the returned uint16_t is not shown here.
7[[nodiscard]]
constexpr static uint16_t float_to_half_generic(
float a)
noexcept
// Reinterpret the float as its raw 32-bit pattern.
29 auto u32 = std::bit_cast<uint32_t>(a);
// Shift the sign bit out, take the 8-bit exponent field, then re-bias it
// by subtracting 127 and adding 15.
32 auto exponent =
static_cast<int16_t
>(
static_cast<uint8_t
>((u32 << 1) >> 24)) - 127 + 15;
// Shift out sign and exponent (9 bits), then keep only the top 10 bits of the
// 23-bit mantissa; the lower 13 mantissa bits are discarded.
35 auto mantissa = (u32 << 9) >> (9 + 24 - 11);
// The re-biased exponent has reached or passed 0x1f (all ones in 5 bits).
37 auto const is_inf = exponent >= 0x1f;
// With the sign bit shifted out: exponent field all ones and mantissa non-zero.
44 auto const is_nan = (u32 << 1) > 0xff000000;
// Positive when the re-biased exponent is below 1.
// NOTE(review): presumably the right-shift used for denormal results; its use
// is not visible in this listing.
53 auto shift = 1 - exponent;
// Start the result with the sign: the signed shift by 31 replicates the sign
// bit, and after << 15 and truncation to 16 bits only bit 15 remains.
71 auto r =
static_cast<uint16_t
>((
static_cast<int32_t
>(u32) >> 31) << 15);
// Element-wise fallback: convert each of the four elements of `a` with the
// scalar float_to_half_generic() and store it at the same index of `r`.
// NOTE(review): the enclosing function header and the declaration of `r` are
// not visible in this listing.
80 for (
size_t i = 0; i != 4; ++i) {
81 r[i] = float_to_half_generic(a[i]);
// Four-element float -> half conversion using the F16C conversion intrinsic.
// NOTE(review): hi_target presumably enables the listed instruction sets for
// this function only; the macro definition and the function signature are not
// visible in this listing.
87hi_target(
"sse,sse2,f16c")
// Unaligned load of four floats starting at a.data().
90 auto const a_ = _mm_loadu_ps(a.data());
// Convert all four lanes to half precision, rounding toward zero (truncation).
91 auto const r = _mm_cvtps_ph(a_, _MM_FROUND_TO_ZERO);
// Take the low 64 bits of the result and reinterpret them as four uint16_t.
92 return std::bit_cast<std::array<uint16_t,4>>(_mm_cvtsi128_si64(r));
// Scalar wrapper around the four-element F16C conversion.
// NOTE(review): the construction of `a_` from `a` is not visible in this
// listing; presumably `a` is placed in element 0 of a four-float array.
95hi_target(
"sse,sse2,f16c")
96[[nodiscard]] hi_no_inline
inline uint16_t float_to_half_f16c(
float a)
noexcept
// Run the four-element overload and return only element 0 of its result.
100 auto const r = float_to_half_f16c(a_);
101 return std::get<0>(r);
// Four-element float -> half conversion using only SSE2 integer operations.
// NaN and denormal-result lanes are not handled here; when any lane hits one
// of those cases the whole input is handed to float_to_half_generic().
// NOTE(review): the function signature is not visible in this listing.

// Build an all-ones register by comparing a register with itself; every
// constant below is derived from it by shifting.
110 auto const unknown_value = _mm_undefined_si128();
111 auto const ffffffff = _mm_cmpeq_epi32(unknown_value, unknown_value);
// Load the four floats (unaligned) and treat them as 32-bit integer lanes.
113 auto r = _mm_castps_si128(_mm_loadu_ps(a.data()));
// Isolate bit 31 (sign) of each lane and move it to bit 15.
116 auto const sign = _mm_slli_epi32(_mm_srli_epi32(r, 31), 15);
// Clear the sign bit in each lane.
119 r = _mm_srli_epi32(_mm_slli_epi32(r, 1), 1);
// 0xff << 23 == 0x7f800000: exponent field all ones, mantissa zero.
// Any lane above that value is a NaN; fall back to the generic conversion.
121 auto const infinite_f32 = _mm_slli_epi32(_mm_srli_epi32(ffffffff, 24), 23);
122 auto const is_nan = _mm_cmpgt_epi32(r, infinite_f32);
123 if (_mm_movemask_epi8(is_nan) != 0) {
124 return float_to_half_generic(a);
// Remember which lanes are exactly zero before the exponent is adjusted.
127 auto const is_zero = _mm_cmpeq_epi32(r, _mm_setzero_si128());
// 7 << 27 == 0x38000000 == (127 - 15) << 23: re-bias the exponent field.
130 auto const exponent_adjust = _mm_slli_epi32(_mm_srli_epi32(ffffffff, 29), 27);
131 r = _mm_sub_epi32(r, exponent_adjust);
// 0xffffffff >> 9 == 0x007fffff. Non-zero lanes that compare (signed) below
// this value have a re-biased exponent of zero or less; fall back to the
// generic conversion for those.
// NOTE(review): the comparison is strict, so a lane exactly equal to
// 0x007fffff does not take the fallback -- confirm this is intended.
134 auto const max_denormal = _mm_srli_epi32(ffffffff, 9);
135 auto const is_denorm = _mm_andnot_si128(is_zero, _mm_cmpgt_epi32(max_denormal, r));
136 if (_mm_movemask_epi8(is_denorm) != 0) {
137 return float_to_half_generic(a);
// Lanes that were zero wrapped around in the subtraction; force them to zero.
141 r = _mm_andnot_si128(is_zero, r);
// 0x1f << 23 == 0x0f800000: 5-bit exponent all ones, mantissa zero.
// Lanes above that value are replaced by exactly that value.
145 auto const infinite_f16_in_f32 = _mm_slli_epi32(_mm_srli_epi32(ffffffff, 27), 23);
146 auto const is_inf = _mm_cmpgt_epi32(r, infinite_f16_in_f32);
147 r = _mm_andnot_si128(is_inf, r);
148 r = _mm_or_si128(r, _mm_and_si128(is_inf, infinite_f16_in_f32));
// Drop the low 13 mantissa bits; exponent and mantissa now sit in bits 0..14.
151 r = _mm_srli_epi32(r, 13);
// Put the sign back in at bit 15.
154 r = _mm_or_si128(r, sign);
// Gather the low 16-bit word of each 32-bit lane into the low 64 bits:
// first within each 64-bit half, then across the two halves.
157 r = _mm_shufflelo_epi16(r, 0b11'11'10'00);
158 r = _mm_shufflehi_epi16(r, 0b11'11'10'00);
159 r = _mm_shuffle_epi32(r, 0b11'11'10'00);
// Reinterpret the low 64 bits as four uint16_t.
160 return std::bit_cast<std::array<uint16_t,4>>(_mm_cvtsi128_si64(r));
// Scalar wrapper around the four-element SSE2 conversion.
// NOTE(review): the construction of `a_` from `a` is not visible in this
// listing; presumably `a` is placed in element 0 of a four-float array.
164[[nodiscard]] hi_no_inline
inline uint16_t float_to_half_sse2(
float a)
noexcept
// Run the four-element overload and return only element 0 of its result.
168 auto const r = float_to_half_sse2(a_);
169 return std::get<0>(r);
// Scalar float -> half entry point. Hardware-specific paths are considered
// only outside constant evaluation; the generic integer implementation is the
// final return.
// NOTE(review): the conditions guarding the F16C path and the construction of
// `v_` are not visible in this listing.
173[[nodiscard]]
constexpr uint16_t float_to_half(
float v)
noexcept
// Intrinsics cannot run during constant evaluation, so only try them at runtime.
175 if (not std::is_constant_evaluated()) {
// Use the four-element F16C conversion and keep element 0.
179 auto tmp = float_to_half_f16c(v_);
180 return std::get<0>(tmp);
// Portable implementation.
185 return float_to_half_generic(v);
// Four-element float -> half entry point: hardware paths outside constant
// evaluation, otherwise an element-wise generic conversion.
// NOTE(review): the function header, the checks selecting between the F16C and
// SSE2 paths, and the declaration of `r` are not visible in this listing.
192 if (not std::is_constant_evaluated()) {
// F16C conversion path.
195 return float_to_half_f16c(v);
// SSE2 integer-only path.
198 return float_to_half_sse2(v);
// Element-wise conversion with the scalar generic routine.
203 for (
size_t i = 0; i != 4; ++i) {
204 r[i] = float_to_half_generic(v[i]);
The HikoGUI namespace.
Definition array_generic.hpp:20
bool has_sse2() noexcept
This CPU has the SSE2 instruction set.
Definition cpu_id_x86.hpp:672
bool has_f16c() noexcept
This CPU has float-16 conversion instructions.
Definition cpu_id_x86.hpp:752
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
@ shift
The shift key is being held.