#include "native_simd_utility.hpp"
#include "../utility/module.hpp"

#include <array>
#include <cstddef>
#include <cstdint>
#include <ostream>
#include <span>
13namespace hi {
inline namespace v1 {
34struct native_simd<double,4> {
35 using value_type = double;
36 constexpr static size_t size = 4;
38 using register_type = __m256d;
42 native_simd(native_simd
const&)
noexcept =
default;
43 native_simd(native_simd&&) noexcept = default;
44 native_simd& operator=(native_simd const&) noexcept = default;
45 native_simd& operator=(native_simd&&) noexcept = default;
49 native_simd() noexcept : v(_mm256_setzero_pd()) {}
51 [[nodiscard]]
explicit native_simd(register_type
other) noexcept : v(
other) {}
53 [[nodiscard]]
explicit operator register_type() const noexcept
65 [[nodiscard]] native_simd(
67 value_type b = value_type{0},
68 value_type c = value_type{0},
69 value_type d = value_type{0})
noexcept :
70 v(_mm256_set_pd(d, c, b, a))
74 [[nodiscard]]
explicit native_simd(value_type
const *
other) noexcept : v(_mm256_loadu_pd(
other)) {}
76 void store(value_type *out)
const noexcept
79 _mm256_storeu_pd(out, v);
82 [[nodiscard]]
explicit native_simd(
void const *
other) noexcept : v(_mm256_loadu_pd(
static_cast<value_type
const *
>(
other))) {}
84 void store(
void *out)
const noexcept
87 _mm256_storeu_pd(
static_cast<value_type *
>(out), v);
90 [[nodiscard]]
explicit native_simd(std::span<value_type const>
other)
noexcept
93 v = _mm256_loadu_pd(
other.data());
96 void store(std::span<value_type> out)
const noexcept
99 _mm256_storeu_pd(out.data(), v);
102 [[nodiscard]]
explicit native_simd(array_type
other) noexcept : v(_mm256_loadu_pd(
other.data())) {}
104 [[nodiscard]]
explicit operator array_type() const noexcept
106 auto r = array_type{};
107 _mm256_storeu_pd(r.data(), v);
111 [[nodiscard]]
explicit native_simd(native_simd<float, 4>
const& a)
noexcept;
112 [[nodiscard]]
explicit native_simd(native_simd<int32_t, 4>
const& a)
noexcept;
124 [[nodiscard]]
static native_simd broadcast(value_type a)
noexcept
126 return native_simd{_mm256_set1_pd(a)};
138 [[nodiscard]]
static native_simd broadcast(native_simd a)
noexcept
141 return native_simd{_mm256_permute4x64_pd(a.v, 0b00'00'00'00)};
143 hilet tmp = _mm256_permute_pd(a.v, 0b0000);
144 return native_simd{_mm256_permute2f128_pd(tmp, tmp, 0b0000'0000)};
150 [[nodiscard]]
static native_simd ones() noexcept
153 auto ones = _mm256_undefined_si256();
154 ones = _mm256_cmpeq_epi32(ones, ones);
155 return native_simd{_mm256_castsi256_pd(ones)};
157 auto ones = _mm256_setzero_pd();
158 ones = _mm256_cmpeq_pd(ones, ones);
159 return native_simd{ones};
163 [[nodiscard]]
static native_simd from_mask(
size_t a)
noexcept
170 auto tmp = _mm_cvtsi32_si128(truncate<uint32_t>(a_));
172 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 1);
174 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 2);
176 tmp = _mm_insert_epi32(tmp, truncate<uint32_t>(a_), 3);
178 tmp = _mm_srai_epi32(tmp, 31);
179 return native_simd{_mm256_castsi256_pd(_mm256_cvtepi32_epi64(tmp))};
184 [[nodiscard]]
size_t mask() const noexcept
186 return narrow_cast<size_t>(_mm256_movemask_pd(v));
195 [[nodiscard]]
friend bool equal(native_simd a, native_simd b)
noexcept
197 return _mm256_movemask_pd(_mm256_cmp_pd(a.v, b.v, _CMP_EQ_UQ)) == 0b1111;
200 [[nodiscard]]
friend native_simd
203 auto abs_diff = abs(a - b);
204 return abs_diff < broadcast(epsilon);
207 [[nodiscard]]
friend bool
210 return almost_eq(a, b, epsilon).mask() == 0b1111;
213 [[nodiscard]]
friend native_simd operator==(native_simd a, native_simd b)
noexcept
215 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ)};
218 [[nodiscard]]
friend native_simd
operator!=(native_simd a, native_simd b)
noexcept
220 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_NEQ_UQ)};
223 [[nodiscard]]
friend native_simd operator<(native_simd a, native_simd b)
noexcept
225 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_LT_OQ)};
228 [[nodiscard]]
friend native_simd
operator>(native_simd a, native_simd b)
noexcept
230 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_GT_OQ)};
233 [[nodiscard]]
friend native_simd
operator<=(native_simd a, native_simd b)
noexcept
235 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_LE_OQ)};
238 [[nodiscard]]
friend native_simd
operator>=(native_simd a, native_simd b)
noexcept
240 return native_simd{_mm256_cmp_pd(a.v, b.v, _CMP_GE_OQ)};
243 [[nodiscard]]
friend native_simd operator+(native_simd a)
noexcept
248 [[nodiscard]]
friend native_simd operator+(native_simd a, native_simd b)
noexcept
250 return native_simd{_mm256_add_pd(a.v, b.v)};
253 [[nodiscard]]
friend native_simd operator-(native_simd a, native_simd b)
noexcept
255 return native_simd{_mm256_sub_pd(a.v, b.v)};
258 [[nodiscard]]
friend native_simd operator-(native_simd a)
noexcept
260 return native_simd{} - a;
263 [[nodiscard]]
friend native_simd operator*(native_simd a, native_simd b)
noexcept
265 return native_simd{_mm256_mul_pd(a.v, b.v)};
268 [[nodiscard]]
friend native_simd operator/(native_simd a, native_simd b)
noexcept
270 return native_simd{_mm256_div_pd(a.v, b.v)};
273 [[nodiscard]]
friend native_simd operator&(native_simd a, native_simd b)
noexcept
275 return native_simd{_mm256_and_pd(a.v, b.v)};
278 [[nodiscard]]
friend native_simd operator|(native_simd a, native_simd b)
noexcept
280 return native_simd{_mm256_or_pd(a.v, b.v)};
283 [[nodiscard]]
friend native_simd operator^(native_simd a, native_simd b)
noexcept
285 return native_simd{_mm256_xor_pd(a.v, b.v)};
288 [[nodiscard]]
friend native_simd operator~(native_simd a)
noexcept
290 return not_and(a, ones());
293 [[nodiscard]]
friend native_simd
min(native_simd a, native_simd b)
noexcept
295 return native_simd{_mm256_min_pd(a.v, b.v)};
298 [[nodiscard]]
friend native_simd
max(native_simd a, native_simd b)
noexcept
300 return native_simd{_mm256_max_pd(a.v, b.v)};
303 [[nodiscard]]
friend native_simd abs(native_simd a)
noexcept
305 return not_and(broadcast(-0.0f), a);
308 [[nodiscard]]
friend native_simd
floor(native_simd a)
noexcept
310 return native_simd{_mm256_floor_pd(a.v)};
313 [[nodiscard]]
friend native_simd
ceil(native_simd a)
noexcept
315 return native_simd{_mm256_ceil_pd(a.v)};
318 template<native_rounding_mode Rounding = native_rounding_mode::current>
319 [[nodiscard]]
friend native_simd
round(native_simd a)
noexcept
321 return native_simd{_mm256_round_pd(a.v, to_underlying(Rounding))};
326 [[nodiscard]]
friend native_simd rcp(native_simd a)
noexcept
328 return native_simd{_mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), a.v)};
333 [[nodiscard]]
friend native_simd
sqrt(native_simd a)
noexcept
335 return native_simd{_mm256_sqrt_pd(a.v)};
344 [[nodiscard]]
friend native_simd rsqrt(native_simd a)
noexcept
355 template<
size_t Mask>
356 [[nodiscard]]
friend native_simd set_zero(native_simd a)
noexcept
358 static_assert(Mask <= 0b1111);
359 return blend<Mask>(a, native_simd{});
369 template<
size_t Index>
370 [[nodiscard]]
friend native_simd insert(native_simd a, value_type b)
noexcept
372 static_assert(Index < 4);
373 return blend<1_uz << Index>(a, broadcast(b));
382 template<
size_t Index>
383 [[nodiscard]]
friend value_type get(native_simd a)
noexcept
385 static_assert(Index < size);
388 return _mm256_cvtsd_f64(_mm256_permute4x64_pd(a.v, Index));
391 constexpr auto hi_index = Index / (size / 2);
392 constexpr auto lo_index = Index % (size / 2);
394 hilet hi = _mm256_extractf128_pd(a.v, hi_index);
395 hilet lo = _mm_permute_pd(
hi, lo_index);
396 return _mm_cvtsd_f64(lo);
408 template<
size_t Mask>
409 [[nodiscard]]
friend native_simd blend(native_simd a, native_simd b)
noexcept
411 static_assert(Mask <= 0b1111);
413 if constexpr (Mask == 0b0000) {
415 }
else if constexpr (Mask == 0b1111) {
418 return native_simd{_mm256_blend_pd(a.v, b.v, Mask)};
434 template<fixed_
string SourceElements>
435 [[nodiscard]]
friend native_simd permute(native_simd a)
noexcept
437 static_assert(SourceElements.size() == size);
438 constexpr auto order = detail::native_swizzle_to_packed_indices<SourceElements, size>();
441 if constexpr (order == 0b11'10'01'00) {
444 return native_simd{_mm256_permute4x64_pd(a.v, order)};
449 constexpr auto hi_order =
450 ((order & 0b00'00'00'10) >> 1) |
451 ((order & 0b00'00'10'00) >> 2) |
452 ((order & 0b00'10'00'00) >> 3) |
453 ((order & 0b10'00'00'00) >> 4);
454 constexpr auto lo_order =
455 (order & 0b00'00'00'01) |
456 ((order & 0b00'00'01'00) >> 1) |
457 ((order & 0b00'01'00'00) >> 2) |
458 ((order & 0b01'00'00'00) >> 3);
461 if constexpr (order == 0b11'10'01'00) {
463 }
else if constexpr (order == 0b00'00'00'00) {
465 }
else if constexpr (hi_order == 0b1100) {
466 return native_simd{_mm256_permute_pd(a.v, lo_order)};
467 }
else if constexpr (hi_order == 0b0011) {
468 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0001);
469 return native_simd{_mm256_permute_pd(tmp, lo_order)};
470 }
else if constexpr (hi_order == 0b1111) {
471 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0001'0001);
472 return native_simd{_mm256_permute_pd(tmp, lo_order)};
473 }
else if constexpr (hi_order == 0b0000) {
474 hilet tmp = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0000);
475 return native_simd{_mm256_permute_pd(tmp, lo_order)};
477 hilet hi_0 = _mm256_permute2f128_pd(a.v, a.v, 0b0000'0000);
478 hilet hi_1 = _mm256_permute2f128_pd(a.v, a.v, 0b0001'0001);
479 hilet lo_0 = _mm256_permute_pd(hi_0, lo_order);
480 hilet lo_1 = _mm256_permute_pd(hi_1, lo_order);
481 return native_simd{_mm256_blend_pd(lo_0, lo_1, hi_order)};
502 template<fixed_
string SourceElements>
503 [[nodiscard]]
friend native_simd swizzle(native_simd a)
noexcept
505 static_assert(SourceElements.size() == size);
506 constexpr auto one_mask = detail::native_swizzle_to_mask<SourceElements, size, '1'>();
507 constexpr auto zero_mask = detail::native_swizzle_to_mask<SourceElements, size, '0'>();
508 constexpr auto number_mask = one_mask | zero_mask;
510 if constexpr (number_mask == 0b1111) {
512 return swizzle_numbers<SourceElements>();
514 }
else if constexpr (number_mask == 0b0000) {
516 return permute<SourceElements>(a);
519 }
else if constexpr (number_mask == zero_mask) {
521 hilet ordered = permute<SourceElements>(a);
522 return set_zero<zero_mask>(ordered);
526 hilet ordered = permute<SourceElements>(a);
527 hilet numbers = swizzle_numbers<SourceElements>();
528 return blend<number_mask>(ordered, numbers);
542 [[nodiscard]]
friend native_simd horizontal_add(native_simd a, native_simd b)
noexcept
544 return permute<
"acbd">(native_simd{_mm256_hadd_pd(a.v, b.v)});
557 [[nodiscard]]
friend native_simd horizontal_sub(native_simd a, native_simd b)
noexcept
559 return permute<
"acbd">(native_simd{_mm256_hsub_pd(a.v, b.v)});
568 [[nodiscard]]
friend native_simd horizontal_sum(native_simd a)
noexcept
570 hilet tmp = horizontal_add(a, a);
571 return native_simd{_mm256_hadd_pd(tmp.v, tmp.v)};
585 [[nodiscard]]
friend native_simd interleaved_sub_add(native_simd a, native_simd b)
noexcept
587 return native_simd{_mm256_addsub_pd(a.v, b.v)};
595 [[nodiscard]]
friend native_simd not_and(native_simd a, native_simd b)
noexcept
597 return native_simd{_mm256_andnot_pd(a.v, b.v)};
602 return a <<
"(" << get<0>(b) <<
", " << get<1>(b) <<
", " << get<2>(b) <<
", " << get<3>(b) <<
")";
605 template<fixed_
string SourceElements>
606 [[nodiscard]]
static native_simd swizzle_numbers() noexcept
608 constexpr auto one_mask = detail::native_swizzle_to_mask<SourceElements, size, '1'>();
609 constexpr auto zero_mask = detail::native_swizzle_to_mask<SourceElements, size, '0'>();
610 constexpr auto number_mask = one_mask | zero_mask;
611 constexpr auto alpha_mask = ~number_mask & 0b1111;
613 if constexpr ((zero_mask | alpha_mask) == 0b1111) {
616 }
else if constexpr ((one_mask | alpha_mask) == 0b1111) {
617 return broadcast(1.0f);
621 to_bool(one_mask & 0b0001) ? 1.0f : 0.0f,
622 to_bool(one_mask & 0b0010) ? 1.0f : 0.0f,
623 to_bool(one_mask & 0b0100) ? 1.0f : 0.0f,
624 to_bool(one_mask & 0b1000) ? 1.0f : 0.0f};
#define hi_axiom(expression,...)
Specify an axiom; an expression that is true.
Definition assert.hpp:253
#define hi_axiom_not_null(expression,...)
Assert if an expression is not nullptr.
Definition assert.hpp:272
#define hilet
Invariant should be the default for variables.
Definition utility.hpp:23
@ round
The end cap of the line is round.
@ other
The gui_event does not have associated data.
DOXYGEN BUG.
Definition algorithm.hpp:13
geometry/margins.hpp
Definition cache.hpp:11