30 _load_shuffle_indices = format.load_shuffle_indices(stride);
31 _concat_shuffle_indices = format.concat_shuffle_indices(stride);
33 _multiplier = f32x4::broadcast(format.unpack_multiplier());
34 _num_chunks_per_quad = format.num_chunks_per_quad(stride);
35 _chunk_stride = format.chunk_stride(stride);
37 _direction = format.endian == std::endian::little ? -1 : 1;
38 _start_byte = format.endian == std::endian::little ? format.num_bytes - 1 : 0;
39 _align_shift = 32 - format.num_bytes * 8;
// Unpack `num_samples` audio samples from the raw byte buffer `src` into the
// float buffer `dst`.
//
// @param src          Source bytes; asserted to be non-null.
// @param dst          Destination floats; asserted to be non-null.
// @param num_samples  Number of floats to write; `dst + num_samples` is the
//                     end of the output range.
//
// The work is split in two ranges: [dst, dst_fast_end) is processed by the
// vector helpers (load_samples / store_samples), and [dst_fast_end, dst_end)
// by the scalar helpers (load_sample / store_sample).
//
// NOTE(review): this extract does not show the braces or the branch that
// selects between the two loop pairs below (bit-reinterpreting vs.
// converting-and-scaling). Presumably the choice depends on whether the
// source format is floating point or integer -- confirm against the full file.
48 void operator()(std::byte
const *hi_restrict src,
float *hi_restrict dst,
std::size_t num_samples)
const noexcept
// Precondition checks: both buffers must be valid pointers.
50 hi_assert(src !=
nullptr);
51 hi_assert(dst !=
nullptr);
// One past the last float that will be written.
55 auto const *
const dst_end = dst + num_samples;
// End of the range handled four floats at a time: num_fast_quads() gives a
// count of quads, hence the `* 4`.
// NOTE(review): presumably num_fast_quads() is conservative so the vector
// loads never read past the end of `src` -- confirm in the format type.
56 auto const *
const dst_fast_end = dst + _format.
num_fast_quads(_stride, num_samples) * 4;
// --- Loop pair 1: the loaded bits are reinterpreted as float. ---
// Vector part. The loop has no visible increment, so load_samples() and
// store_samples() must advance `src` and `dst` themselves (presumably they
// take the pointers by reference -- confirm against the helpers).
59 while (dst != dst_fast_end) {
60 auto const int_samples =
61 load_samples(src, _load_shuffle_indices, _concat_shuffle_indices, _num_chunks_per_quad, _chunk_stride);
// NOTE(review): f32x4::cast_from() presumably reinterprets the lanes without
// numeric conversion, mirroring the std::bit_cast in the scalar loop below.
62 auto const float_samples = f32x4::cast_from(int_samples);
63 store_samples(dst, float_samples);
// Scalar tail: remaining samples, one at a time, up to dst_end.
65 while (dst != dst_end) {
66 auto const int_sample = load_sample(src, _stride, _format.
num_bytes, _direction, _start_byte, _align_shift);
// std::bit_cast keeps the bit pattern; no integer-to-float conversion happens.
67 auto const float_sample = std::bit_cast<float>(int_sample);
68 store_sample(dst, float_sample);
// --- Loop pair 2: the loaded integers are converted to float and scaled. ---
// Local copy of the member multiplier, used by both loops below.
72 auto const multiplier = _multiplier;
// Vector part: convert four integers to float and multiply lane-wise.
73 while (dst != dst_fast_end) {
74 auto const int_samples =
75 load_samples(src, _load_shuffle_indices, _concat_shuffle_indices, _num_chunks_per_quad, _chunk_stride);
76 auto const float_samples =
static_cast<f32x4>(int_samples) * multiplier;
77 store_samples(dst, float_samples);
// Scalar tail: same conversion for the remaining samples. Lane 0 of the
// multiplier is used; _multiplier is built with f32x4::broadcast(), so every
// lane holds the same value.
79 while (dst != dst_end) {
80 auto const int_sample = load_sample(src, _stride, _format.
num_bytes, _direction, _start_byte, _align_shift);
81 auto const float_sample =
static_cast<float>(int_sample) * get<0>(multiplier);
82 store_sample(dst, float_sample);
133 [[nodiscard]]
static i8x16 load_samples(std::byte
const *hi_restrict & src, i8x16 load_shuffle_indices,
std::size_t stride)
noexcept