HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
float16_sse4_1.hpp
1// Copyright Take Vos 2021-2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#if defined(HI_HAS_SSE4_1)
8#include <smmintrin.h> // SSE4.1
9#include <ammintrin.h> // SSE4A
10#endif
11#if defined(HI_HAS_SSSE3)
12#include <tmmintrin.h> // SSSE3
13#endif
14#if defined(HI_HAS_SSE3)
15#include <pmmintrin.h> // SSE3
16#endif
17#if defined(HI_HAS_SSE2)
18#include <emmintrin.h> // SSE2
19#endif
20#if defined(HI_HAS_SSE)
21#include <xmmintrin.h> // SSE
22#endif
23#include "../utility/utility.hpp"
24
25
26
27namespace hi::inline v1 {
28
29inline __m128 _mm_cvtph_ps_sse2(__m128i value) noexcept
30{
31 hilet f32_to_f16_constants = _mm_set_epi32(0, f32_to_f16_adjustment, f32_to_f16_infinite, f32_to_f16_lowest_normal - 1);
32
33 // Convert the 16 bit values to 32 bit with leading zeros.
34 auto u = _mm_unpacklo_epi16(value, _mm_setzero_si128()); // SSE2
35
36 // Extract the sign bit.
37 hilet sign = _mm_slli_epi32(_mm_srli_epi32(u, 15), 31); // SSE2
38
39 // Strip the sign bit and align the exponent/mantissa boundary to a float 32.
40 u = _mm_srli_epi32(_mm_slli_epi32(u, 17), 4); // SSE2
41
42 // Adjust the bias. f32_to_f16_adjustment
45
46 // Get a mask of '1' bits when the half-float would be normal or infinite.
49
50 // Add the sign bit back in.
51 u = _mm_or_si128(u, sign); // SSE2
52
53 // Keep the value if normal, if denormal make it zero.
54 u = _mm_and_si128(u, is_normal); // SSE2
55
56 return _mm_castsi128_ps(u); // SSE2
57}
58
59inline __m128i _mm_cvtps_ph_sse4_1(__m128 value) noexcept
60{
61 hilet f32_to_f16_constants = _mm_set_epi32(0, f32_to_f16_adjustment, f32_to_f16_infinite, f32_to_f16_lowest_normal - 1);
62
63 // Interpret the floating point number as 32 bit-field.
64 auto u = _mm_castps_si128(value); // SSE2
65
66 // Get the sign of the floating point number as a bit mask of the upper 17 bits.
67 hilet sign = _mm_slli_epi32(_mm_srai_epi32(u, 31), 15); // SSE2
68
69 // Strip sign bit.
70 u = _mm_srli_epi32(_mm_slli_epi32(u, 1), 1); // SSE2
71
72 // Get a mask of '1' bits when the half-float would be normal or infinite.
75
76 // Clamp the floating point number to where the half-float would be infinite.
77 hilet f32_to_f16_infinite_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b01'01'01'01); // SSE2
79
80 // Convert the bias from float to half-float.
83
84 // Shift the float until it becomes a half-float. This truncates the mantissa.
85 u = _mm_srli_epi32(u, 13);
86
87 // Keep the value if normal, if denormal make it zero.
89
90 // Add the sign bit back in, also set the upper 16 bits so that saturated pack
91 // will work correctly when converting to int16.
92 u = _mm_or_si128(u, sign);
93
94 // Saturate and pack the 32 bit integers to 16 bit integers.
95 return _mm_packs_epi32(u, u);
96}
97
98} // namespace hi::inline v1
DOXYGEN BUG.
Definition algorithm.hpp:16
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377