HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
float16_sse4_1.hpp
1// Copyright Take Vos 2021-2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#if defined(HI_HAS_SSE4_1)
8#include <smmintrin.h> // SSE4.1
9#include <ammintrin.h> // SSE4A
10#endif
11#if defined(HI_HAS_SSSE3)
12#include <tmmintrin.h> // SSSE3
13#endif
14#if defined(HI_HAS_SSE3)
15#include <pmmintrin.h> // SSE3
16#endif
17#if defined(HI_HAS_SSE2)
18#include <emmintrin.h> // SSE2
19#endif
20#if defined(HI_HAS_SSE)
21#include <xmmintrin.h> // SSE
22#endif
23#include "../utility/module.hpp"
24
25namespace hi::inline v1 {
26
27inline __m128 _mm_cvtph_ps_sse2(__m128i value) noexcept
28{
29 hilet f32_to_f16_constants = _mm_set_epi32(0, f32_to_f16_adjustment, f32_to_f16_infinite, f32_to_f16_lowest_normal - 1);
30
31 // Convert the 16 bit values to 32 bit with leading zeros.
32 auto u = _mm_unpacklo_epi16(value, _mm_setzero_si128()); // SSE2
33
34 // Extract the sign bit.
35 hilet sign = _mm_slli_epi32(_mm_srli_epi32(u, 15), 31); // SSE2
36
37 // Strip the sign bit and align the exponent/mantissa boundary to a float 32.
38 u = _mm_srli_epi32(_mm_slli_epi32(u, 17), 4); // SSE2
39
40 // Adjust the bias. f32_to_f16_adjustment
41 hilet f32_to_f16_adjustment_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b10'10'10'10); // SSE2
42 u = _mm_add_epi32(u, f32_to_f16_adjustment_); // SSE2
43
44 // Get a mask of '1' bits when the half-float would be normal or infinite.
45 hilet f32_to_f16_lowest_normal_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b00'00'00'00); // SSE2
46 hilet is_normal = _mm_cmpgt_epi32(u, f32_to_f16_lowest_normal_); // SSE2
47
48 // Add the sign bit back in.
49 u = _mm_or_si128(u, sign); // SSE2
50
51 // Keep the value if normal, if denormal make it zero.
52 u = _mm_and_si128(u, is_normal); // SSE2
53
54 return _mm_castsi128_ps(u); // SSE2
55}
56
57inline __m128i _mm_cvtps_ph_sse4_1(__m128 value) noexcept
58{
59 hilet f32_to_f16_constants = _mm_set_epi32(0, f32_to_f16_adjustment, f32_to_f16_infinite, f32_to_f16_lowest_normal - 1);
60
61 // Interpret the floating point number as 32 bit-field.
62 auto u = _mm_castps_si128(value); // SSE2
63
64 // Get the sign of the floating point number as a bit mask of the upper 17 bits.
65 hilet sign = _mm_slli_epi32(_mm_srai_epi32(u, 31), 15); // SSE2
66
67 // Strip sign bit.
68 u = _mm_srli_epi32(_mm_slli_epi32(u, 1), 1); // SSE2
69
70 // Get a mask of '1' bits when the half-float would be normal or infinite.
71 hilet f32_to_f16_lowest_normal_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b00'00'00'00); // SSE2
72 hilet is_normal = _mm_cmpgt_epi32(u, f32_to_f16_lowest_normal_);
73
74 // Clamp the floating point number to where the half-float would be infinite.
75 hilet f32_to_f16_infinite_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b01'01'01'01); // SSE2
76 u = _mm_min_epi32(u, f32_to_f16_infinite_); // SSE4.1
77
78 // Convert the bias from float to half-float.
79 hilet f32_to_f16_adjustment_ = _mm_shuffle_epi32(f32_to_f16_constants, 0b10'10'10'10); // SSE2
80 u = _mm_sub_epi32(u, f32_to_f16_adjustment_);
81
82 // Shift the float until it becomes a half-float. This truncates the mantissa.
83 u = _mm_srli_epi32(u, 13);
84
85 // Keep the value if normal, if denormal make it zero.
86 u = _mm_and_si128(u, is_normal);
87
88 // Add the sign bit back in, also set the upper 16 bits so that saturated pack
89 // will work correctly when converting to int16.
90 u = _mm_or_si128(u, sign);
91
92 // Saturate and pack the 32 bit integers to 16 bit integers.
93 return _mm_packs_epi32(u, u);
94}
95
96} // namespace hi::inline v1
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
DOXYGEN BUG.
Definition algorithm.hpp:13