HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
float16_sse4_1.hpp
1// Copyright Take Vos 2021-2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#if defined(HI_HAS_SSE4_1)
8#include <smmintrin.h> // SSE4.1
9#include <ammintrin.h> // SSE4A
10#endif
11#if defined(HI_HAS_SSSE3)
12#include <tmmintrin.h> // SSSE3
13#endif
14#if defined(HI_HAS_SSE3)
15#include <pmmintrin.h> // SSE3
16#endif
17#if defined(HI_HAS_SSE2)
18#include <emmintrin.h> // SSE2
19#endif
20#if defined(HI_HAS_SSE)
21#include <xmmintrin.h> // SSE
22#endif
23#include "../utility.hpp"
24#include "../float16.hpp"
25
26namespace hi::inline v1 {
27
28inline __m128 _mm_cvtph_ps_sse2(__m128i value) noexcept
29{
30 hilet f32_to_f16_constants = _mm_set_epi32(0, f32_to_f16_adjustment, f32_to_f16_infinite, f32_to_f16_lowest_normal - 1);
31
32 // Convert the 16 bit values to 32 bit with leading zeros.
33 auto u = _mm_unpacklo_epi16(value, _mm_setzero_si128()); // SSE2
34
35 // Extract the sign bit.
36 hilet sign = _mm_slli_epi32(_mm_srli_epi32(u, 15), 31); // SSE2
37
38 // Strip the sign bit and align the exponent/mantissa boundary to a float 32.
39 u = _mm_srli_epi32(_mm_slli_epi32(u, 17), 4); // SSE2
40
41 // Adjust the bias. f32_to_f16_adjustment
42 hilet f32_to_f16_adjustment_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b10'10'10'10); // SSE2
43 u = _mm_add_epi32(u, f32_to_f16_adjustment_); // SSE2
44
45 // Get a mask of '1' bits when the half-float would be normal or infinite.
46 hilet f32_to_f16_lowest_normal_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b00'00'00'00); // SSE2
47 hilet is_normal = _mm_cmpgt_epi32(u, f32_to_f16_lowest_normal_); // SSE2
48
49 // Add the sign bit back in.
50 u = _mm_or_si128(u, sign); // SSE2
51
52 // Keep the value if normal, if denormal make it zero.
53 u = _mm_and_si128(u, is_normal); // SSE2
54
55 return _mm_castsi128_ps(u); // SSE2
56}
57
58inline __m128i _mm_cvtps_ph_sse4_1(__m128 value) noexcept
59{
60 hilet f32_to_f16_constants = _mm_set_epi32(0, f32_to_f16_adjustment, f32_to_f16_infinite, f32_to_f16_lowest_normal - 1);
61
62 // Interpret the floating point number as 32 bit-field.
63 auto u = _mm_castps_si128(value); // SSE2
64
65 // Get the sign of the floating point number as a bit mask of the upper 17 bits.
66 hilet sign = _mm_slli_epi32(_mm_srai_epi32(u, 31), 15); // SSE2
67
68 // Strip sign bit.
69 u = _mm_srli_epi32(_mm_slli_epi32(u, 1), 1); // SSE2
70
71 // Get a mask of '1' bits when the half-float would be normal or infinite.
72 hilet f32_to_f16_lowest_normal_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b00'00'00'00); // SSE2
73 hilet is_normal = _mm_cmpgt_epi32(u, f32_to_f16_lowest_normal_);
74
75 // Clamp the floating point number to where the half-float would be infinite.
76 hilet f32_to_f16_infinite_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b01'01'01'01); // SSE2
77 u = _mm_min_epi32(u, f32_to_f16_infinite_); // SSE4.1
78
79 // Convert the bias from float to half-float.
80 hilet f32_to_f16_adjustment_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b10'10'10'10); // SSE2
81 u = _mm_sub_epi32(u, f32_to_f16_adjustment_);
82
83 // Shift the float until it becomes a half-float. This truncates the mantissa.
84 u = _mm_srli_epi32(u, 13);
85
86 // Keep the value if normal, if denormal make it zero.
87 u = _mm_and_si128(u, is_normal);
88
89 // Add the sign bit back in, also set the upper 16 bits so that saturated pack
90 // will work correctly when converting to int16.
91 u = _mm_or_si128(u, sign);
92
93 // Saturate and pack the 32 bit integers to 16 bit integers.
94 return _mm_packs_epi32(u, u);
95}
96
97} // namespace hi::inline v1
Utilities used by the HikoGUI library itself.
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:15