HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
int_carry.hpp
1// Copyright Take Vos 2019, 2021-2022.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../utility/utility.hpp"
8#include "../macros.hpp"
9#include <complex>
10#include <cmath>
11#include <limits>
12#include <span>
13#include <tuple>
14#include <concepts>
15#include <compare>
16
17
18#if HI_COMPILER == HI_CC_MSVC
19#include <intrin.h>
20#endif
21#if HI_PROCESSOR == HI_CPU_X86_64
22#include <immintrin.h>
23#endif
24
25hi_export_module(hikogui.numeric.int_carry);
26
27hi_warning_push();
28// C4702 unreachable code: Suppressed due intrinsics and std::is_constant_evaluated()
29hi_warning_ignore_msvc(4702);
30// C26472: Don't use static_cast for arithmetic conversions... (type.1)
31// int_carry does many static_casts on purpose.
32hi_warning_ignore_msvc(26472)
33
34hi_export namespace hi {
35
43template<std::unsigned_integral T>
44[[nodiscard]] hi_force_inline constexpr T get_bit(T const *lhs, std::size_t index) noexcept
45{
46 constexpr std::size_t bits_per_digit = sizeof(T) * CHAR_BIT;
47
48 auto const digit_count = index / bits_per_digit;
49 auto const bit_count = index % bits_per_digit;
50
51 return (lhs[digit_count] >> bit_count) & 1;
52}
53
61template<std::unsigned_integral T>
62constexpr void set_bit(T *r, std::size_t index, T value = T{1}) noexcept
63{
64 hi_axiom(value <= 1);
65
66 constexpr std::size_t bits_per_digit = sizeof(T) * CHAR_BIT;
67
68 auto const digit_count = index / bits_per_digit;
69 auto const bit_count = index % bits_per_digit;
70
71 value <<= bit_count;
72 auto const mask = ~(T{1} << bit_count);
73 r[digit_count] = (r[digit_count] & mask) | value;
74}
75
82template<std::unsigned_integral T>
83hi_force_inline constexpr std::pair<T, T> sll_carry(T lhs, std::size_t rhs, T carry = T{0}) noexcept
84{
85 constexpr auto num_bits = sizeof(T) * CHAR_BIT;
86 auto const reverse_count = num_bits - rhs;
87
88 return {(lhs << rhs) | carry, lhs >> reverse_count};
89}
90
97template<std::unsigned_integral T>
98hi_force_inline constexpr std::pair<T, T> srl_carry(T lhs, std::size_t rhs, T carry = T{0}) noexcept
99{
100 constexpr auto num_bits = sizeof(T) * CHAR_BIT;
101 auto const reverse_count = num_bits - rhs;
102
103 return {(lhs >> rhs) | carry, lhs << reverse_count};
104}
105
111template<std::unsigned_integral T>
112hi_force_inline constexpr std::pair<T, T> sra_carry(T lhs, std::size_t rhs) noexcept
113{
114 using S = std::make_signed_t<T>;
115
116 constexpr auto num_bits = sizeof(T) * CHAR_BIT;
117 auto const reverse_count = num_bits - rhs;
118
119 return {(static_cast<S>(lhs) >> rhs), lhs << reverse_count};
120}
121
128template<std::unsigned_integral T>
129hi_force_inline constexpr std::pair<T, T> add_carry(T lhs, T rhs, T carry = T{0}) noexcept
130{
131 hi_axiom(carry <= 1);
132
133 constexpr std::size_t num_bits = sizeof(T) * CHAR_BIT;
134
135 if constexpr (has_native_uintxx_v<num_bits * 2>) {
136 // We can use a native type that has double the size.
137 using U = make_uintxx_t<num_bits * 2>;
138
139 auto const r = static_cast<U>(lhs) + static_cast<U>(rhs) + static_cast<U>(carry);
140 return {static_cast<T>(r), static_cast<T>(r >> num_bits)};
141
142 } else if (not std::is_constant_evaluated()) {
143#if HI_COMPILER == HI_CC_MSVC
144 uint64_t r;
145 auto const c = _addcarry_u64(static_cast<unsigned char>(carry), lhs, rhs, &r);
146 return {r, static_cast<T>(c)};
147#endif
148 }
149
150 // Carry can directly be added the sum without a double overflow.
151 auto const r = static_cast<T>(lhs + rhs + carry);
152 auto const c = static_cast<T>(r < lhs or r < rhs);
153 return {r, c};
154}
155
167template<std::unsigned_integral T>
168hi_force_inline constexpr std::pair<T, T> mul_carry(T lhs, T rhs, T carry = T{0}, T accumulator = T{0}) noexcept
169{
170 constexpr std::size_t num_bits = sizeof(T) * CHAR_BIT;
171
172 if constexpr (has_native_uintxx_v<num_bits * 2>) {
173 // We can use a native type that has double the size.
174 using U = make_uintxx_t<num_bits * 2>;
175
176 auto const r = static_cast<U>(lhs) * static_cast<U>(rhs) + static_cast<U>(carry) + static_cast<U>(accumulator);
177 return {static_cast<T>(r), static_cast<T>(r >> num_bits)};
178
179 } else if (not std::is_constant_evaluated()) {
180#if HI_COMPILER == HI_CC_MSVC
181 if constexpr (sizeof(T) == 8) {
182 uint64_t hi = 0;
183 uint64_t lo = _umul128(lhs, rhs, &hi);
184 uint64_t c = 0;
185 std::tie(lo, c) = add_carry(lo, carry, uint64_t{0});
186 std::tie(hi, c) = add_carry(hi, uint64_t{0}, c);
187 std::tie(lo, c) = add_carry(lo, accumulator, uint64_t{0});
188 std::tie(hi, c) = add_carry(hi, uint64_t{0}, c);
189 return {lo, hi};
190 }
191#endif
192 }
193
194 constexpr std::size_t num_half_bits = num_bits / 2;
195 constexpr T half_mask = (T{1} << num_half_bits) - T{1};
196
197 auto const A = lhs >> num_half_bits;
198 auto const B = lhs & half_mask;
199 auto const C = rhs >> num_half_bits;
200 auto const D = rhs & half_mask;
201 auto const AC = A * C;
202 auto const AD = A * D;
203 auto const BC = B * C;
204 auto const BD = B * D;
205
206 // Provisional result.
207 auto hi = AC;
208 auto lo = BD;
209 auto c = T{0};
210
211 // AD and BC are shifted half way across the lo and hi of the result.
212 auto const AD_lo = AD << num_half_bits;
213 auto const AD_hi = AD >> num_half_bits;
214 auto const BC_lo = BC << num_half_bits;
215 auto const BC_hi = BC >> num_half_bits;
216
217 std::tie(lo, c) = add_carry(lo, AD_lo, T{0});
218 std::tie(hi, c) = add_carry(hi, AD_hi, c);
219 std::tie(lo, c) = add_carry(lo, BC_lo, T{0});
220 std::tie(hi, c) = add_carry(hi, BC_hi, c);
221
222 // Now add the carry and accumulator arguments.
223 std::tie(lo, c) = add_carry(lo, carry, T{0});
224 std::tie(hi, c) = add_carry(hi, T{0}, c);
225 std::tie(lo, c) = add_carry(lo, accumulator, T{0});
226 std::tie(hi, c) = add_carry(hi, T{0}, c);
227 return {lo, hi};
228}
229
239template<std::unsigned_integral T>
240hi_force_inline constexpr T wide_div(T lhs_lo, T lhs_hi, T rhs) noexcept
241{
242 if constexpr (sizeof(T) == 1) {
243 auto const lhs = static_cast<uint16_t>(lhs_hi) << 8 | static_cast<uint16_t>(lhs_lo);
244 return narrow_cast<uint8_t>(lhs / rhs);
245
246 } else if constexpr (sizeof(T) == 2) {
247 auto const lhs = static_cast<uint32_t>(lhs_hi) << 16 | static_cast<uint32_t>(lhs_lo);
248 return narrow_cast<uint16_t>(lhs / rhs);
249
250 } else if constexpr (sizeof(T) == 4) {
251 auto const lhs = static_cast<uint64_t>(lhs_hi) << 32 | static_cast<uint64_t>(lhs_lo);
252 return narrow_cast<uint32_t>(lhs / rhs);
253
254 } else if constexpr (sizeof(T) == 8) {
255#if HI_COMPILER == HI_CC_MSVC
256 uint64_t remainder;
257 return _udiv128(lhs_hi, lhs_lo, rhs, &remainder);
258
259#elif HI_COMPILER == HI_CC_CLANG && HI_STD_LIBRARY == HI_STL_MS
260 // clang build against the MS-STL does not have udiv128 nor can it do __int128 division.
261 // Implement binary division.
262 asm("divq %[d]"
263 : "+d" (lhs_hi), "+a" (lhs_lo)
264 : [d] "r" (rhs)
265 : "cc"
266 );
267 return lhs_lo;
268
269
270#elif HI_COMPILER == HI_CC_CLANG || HI_COMPILER == HI_CC_GCC
271 auto const lhs = static_cast<unsigned __int128>(lhs_hi) << 64 | static_cast<unsigned __int128>(lhs_lo);
272 return static_cast<uint64_t>(lhs / rhs);
273#else
274 hi_axiom(rhs != 0, "divide by zero.");
275
276 auto R = uint64_t{0};
277 auto Q_hi = uint64_t{0};
278 for (auto mask = 0x8000'0000'0000'0000ULL; mask != 0; mask >>= 1) {
279 R <<= 1;
280 R |= (lhs_hi & mask) != 0 ? 1 : 0;
281
282 hi_axiom(R < rhs, "result overflow");
283 if (R >= rhs) {
284 R -= rhs;
285 Q_hi |= mask;
286 }
287 }
288
289 auto Q_lo = uint64_t{0};
290 for (auto mask = 0x8000'0000'0000'0000ULL; mask != 0; mask >>= 1) {
291 R <<= 1;
292 R |= (lhs_lo & mask) != 0 ? 1 : 0;
293
294 if (R >= rhs) {
295 R -= rhs;
296 Q_lo |= mask;
297 }
298 }
299
300 return Q_lo;
301#endif
302 }
303}
304
305
312template<std::unsigned_integral T>
313[[nodiscard]] hi_force_inline constexpr ssize_t bsr_carry_chain(T const *lhs, std::size_t n) noexcept
314{
315 constexpr std::size_t bits_per_digit = sizeof(T) * CHAR_BIT;
316
317 for (ssize_t i = static_cast<ssize_t>(n) - 1; i >= 0; i--) {
318 auto tmp = std::countl_zero(lhs[i]);
319 if (tmp < bits_per_digit) {
320 return i * bits_per_digit + bits_per_digit - tmp - 1;
321 }
322 }
323 return -1;
324}
325
333template<std::unsigned_integral T>
334hi_force_inline constexpr void invert_carry_chain(T *r, T const *rhs, std::size_t n) noexcept
335{
336 for (std::size_t i = 0; i != n; ++i) {
337 r[i] = ~rhs[i];
338 }
339}
340
348template<std::unsigned_integral T>
349hi_force_inline constexpr void sll_carry_chain(T *r, T const *lhs, std::size_t rhs, std::size_t n) noexcept
350{
351 constexpr std::size_t bits_per_digit = sizeof(T) * CHAR_BIT;
352
353 auto const digit_count = static_cast<ssize_t>(rhs / bits_per_digit);
354 auto const bit_count = rhs % bits_per_digit;
355
356 if (r != lhs or digit_count > 0) {
357 ssize_t i;
358 for (i = static_cast<ssize_t>(n) - 1; i >= digit_count; --i) {
359 r[i] = lhs[i - digit_count];
360 }
361 for (; i >= 0; --i) {
362 r[i] = T{0};
363 }
364 }
365
366 if (bit_count > 0) {
367 auto carry = T{0};
368 for (std::size_t i = 0; i != n; ++i) {
369 std::tie(r[i], carry) = sll_carry(r[i], bit_count, carry);
370 }
371 }
372}
373
381template<std::unsigned_integral T>
382hi_force_inline constexpr void srl_carry_chain(T *r, T const *lhs, std::size_t rhs, std::size_t n) noexcept
383{
384 constexpr std::size_t bits_per_digit = sizeof(T) * CHAR_BIT;
385
386 auto const digit_count = rhs / bits_per_digit;
387 auto const bit_count = rhs % bits_per_digit;
388
389 if (r != lhs or digit_count > 0) {
390 std::size_t i = 0;
391 for (; i != (n - digit_count); ++i) {
392 r[i] = lhs[i + digit_count];
393 }
394 for (; i != n; ++i) {
395 r[i] = T{0};
396 }
397 }
398
399 if (bit_count > 0) {
400 auto carry = T{0};
401
402 for (ssize_t i = static_cast<ssize_t>(n) - digit_count - 1; i >= 0; --i) {
403 std::tie(r[i], carry) = srl_carry(r[i], bit_count, carry);
404 }
405 }
406}
407
416template<std::unsigned_integral T>
417hi_force_inline constexpr void sra_carry_chain(T *r, T const *lhs, std::size_t rhs, std::size_t n) noexcept
418{
419 using S = std::make_signed_t<T>;
420 constexpr std::size_t bits_per_digit = sizeof(T) * CHAR_BIT;
421
422 auto const digit_count = rhs / bits_per_digit;
423 auto const bit_count = rhs % bits_per_digit;
424
425 if (r != lhs or digit_count > 0) {
426 hi_axiom(digit_count < n);
427
428 std::size_t i = 0;
429 for (; i != (n - digit_count); ++i) {
430 r[i] = lhs[i + digit_count];
431 }
432
433 // Sign extent the digits that are unused after a large shift.
434 auto const sign = lhs[n - 1] < 0 ? S{-1} : S{0};
435 for (; i != n; ++i) {
436 r[i] = sign;
437 }
438 }
439
440 if (bit_count > 0) {
441 hi_axiom(n > 0);
442 auto carry = T{};
443
444 // The most significant digit is sign extended.
445 ssize_t i = static_cast<ssize_t>(n) - digit_count - 1;
446 std::tie(r[i], carry) = sra_carry(r[i], bit_count);
447 --i;
448
449 // The rest of the digits pass through the carry.
450 for (; i >= 0; --i) {
451 std::tie(r[i], carry) = srl_carry(r[i], bit_count, carry);
452 }
453 }
454}
455
463template<std::unsigned_integral T>
464hi_force_inline constexpr void and_carry_chain(T *r, T const *lhs, T const *rhs, std::size_t n) noexcept
465{
466 for (std::size_t i = 0; i != n; ++i) {
467 r[i] = lhs[i] & rhs[i];
468 }
469}
470
478template<std::unsigned_integral T>
479hi_force_inline constexpr void or_carry_chain(T *r, T const *lhs, T const *rhs, std::size_t n) noexcept
480{
481 for (std::size_t i = 0; i != n; ++i) {
482 r[i] = lhs[i] | rhs[i];
483 }
484}
485
493template<std::unsigned_integral T>
494hi_force_inline constexpr void xor_carry_chain(T *r, T const *lhs, T const *rhs, std::size_t n) noexcept
495{
496 for (std::size_t i = 0; i != n; ++i) {
497 r[i] = lhs[i] ^ rhs[i];
498 }
499}
500
501template<std::unsigned_integral T>
502[[nodiscard]] hi_force_inline constexpr bool eq_carry_chain(T const *lhs, T const *rhs, std::size_t n) noexcept
503{
504 for (std::size_t i = 0; i != n; ++i) {
505 if (lhs[i] != rhs[i]) {
506 return false;
507 }
508 }
509 return true;
510}
511
512template<std::unsigned_integral T>
513[[nodiscard]] hi_force_inline constexpr bool ne_carry_chain(T const *lhs, T const *rhs, std::size_t n) noexcept
514{
515 return not eq_carry_chain(lhs, rhs, n);
516}
517
518template<std::unsigned_integral T>
519[[nodiscard]] hi_force_inline constexpr std::strong_ordering
520cmp_unsigned_carry_chain(T const *lhs, T const *rhs, std::size_t n) noexcept
521{
522 for (ssize_t i = static_cast<ssize_t>(n) - 1; i >= 0; --i) {
523 auto const r = lhs[i] <=> rhs[i];
524 if (r != std::strong_ordering::equal) {
525 return r;
526 }
527 }
528 return std::strong_ordering::equal;
529}
530
531template<std::unsigned_integral T>
532[[nodiscard]] hi_force_inline constexpr std::strong_ordering cmp_signed_carry_chain(T const *lhs, T const *rhs, std::size_t n) noexcept
533{
534 using S = std::make_signed_t<T>;
535
536 // Compare the ms-digit using signed comparison, because it includes the sign-bit
537 if (n > 0) {
538 auto const r = static_cast<S>(lhs[n - 1]) <=> static_cast<S>(rhs[n - 1]);
539 if (r != std::strong_ordering::equal) {
540 return r;
541 }
542 }
543
544 // At this point both values have the same sign, and since the rest of the digits
545 // do not have a sign bit, use unsigned comparison.
546 for (ssize_t i = static_cast<ssize_t>(n) - 2; i >= 0; --i) {
547 auto const r = lhs[i] <=> rhs[i];
548 if (r != std::strong_ordering::equal) {
549 return r;
550 }
551 }
552 return std::strong_ordering::equal;
553}
554
555template<std::unsigned_integral T>
556[[nodiscard]] hi_force_inline constexpr bool lt_unsigned_carry_chain(T const *lhs, T const *rhs, std::size_t n) noexcept
557{
558 return cmp_unsigned_carry_chain(lhs, rhs, n) == std::strong_ordering::less;
559}
560
561template<std::unsigned_integral T>
562[[nodiscard]] hi_force_inline constexpr bool gt_unsigned_carry_chain(T const *lhs, T const *rhs, std::size_t n) noexcept
563{
564 return cmp_unsigned_carry_chain(lhs, rhs, n) == std::strong_ordering::greater;
565}
566
567template<std::unsigned_integral T>
568[[nodiscard]] hi_force_inline constexpr bool ge_unsigned_carry_chain(T const *lhs, T const *rhs, std::size_t n) noexcept
569{
570 return lt_unsigned_carry_chain(rhs, lhs, n);
571}
572
573template<std::unsigned_integral T>
574[[nodiscard]] hi_force_inline constexpr bool le_unsigned_carry_chain(T const *lhs, T const *rhs, std::size_t n) noexcept
575{
576 return gt_unsigned_carry_chain(rhs, lhs, n);
577}
578
586template<std::unsigned_integral T>
587hi_force_inline constexpr void neg_carry_chain(T *r, T const *rhs, std::size_t n) noexcept
588{
589 auto carry = T{1};
590 for (std::size_t i = 0; i != n; ++i) {
591 std::tie(r[i], carry) = add_carry(~rhs[i], T{0}, carry);
592 }
593}
594
602template<std::unsigned_integral T>
603hi_force_inline constexpr void add_carry_chain(T *r, T const *lhs, T const *rhs, std::size_t n) noexcept
604{
605 auto carry = T{0};
606 for (std::size_t i = 0; i != n; ++i) {
607 std::tie(r[i], carry) = add_carry(lhs[i], rhs[i], carry);
608 }
609}
610
618template<std::unsigned_integral T>
619hi_force_inline constexpr void sub_carry_chain(T *r, T const *lhs, T const *rhs, std::size_t n) noexcept
620{
621 auto carry = T{1};
622 for (std::size_t i = 0; i != n; ++i) {
623 std::tie(r[i], carry) = add_carry(lhs[i], ~rhs[i], carry);
624 }
625}
626
635template<std::unsigned_integral T>
636hi_force_inline constexpr void mul_carry_chain(T *hi_restrict r, T const *lhs, T const *rhs, std::size_t n) noexcept
637{
638 hi_axiom(r != lhs and r != rhs);
639
640 for (auto rhs_index = 0; rhs_index < n; rhs_index++) {
641 auto const rhs_digit = rhs[rhs_index];
642
643 T carry = 0;
644 for (auto lhs_index = 0; (lhs_index + rhs_index) < n; lhs_index++) {
645 auto const lhs_digit = lhs[lhs_index];
646
647 T result;
648 T accumulator = r[rhs_index + lhs_index];
649 std::tie(result, carry) = mul_carry(lhs_digit, rhs_digit, carry, accumulator);
650 r[rhs_index + lhs_index] = result;
651 }
652 }
653}
654
665template<std::unsigned_integral T>
666constexpr void div_carry_chain(T *hi_restrict quotient, T *hi_restrict remainder, T const *lhs, T const *rhs, std::size_t n) noexcept
667{
668 hi_axiom(quotient != lhs and quotient != rhs and quotient != remainder);
669 hi_axiom(remainder != lhs and remainder != rhs);
670
671 auto const nr_bits = static_cast<ssize_t>(n * sizeof(T) * CHAR_BIT);
672
673 for (ssize_t i = nr_bits - 1; i >= 0; i--) {
674 sll_carry_chain(remainder, remainder, 1, n);
675 remainder[0] |= get_bit(lhs, i);
676 if (ge_unsigned_carry_chain(remainder, rhs, n)) {
677 sub_carry_chain(remainder, remainder, rhs, n);
678 set_bit(quotient, i);
679 }
680 }
681}
682
694template<std::unsigned_integral T>
695constexpr void
696signed_div_carry_chain(T *hi_restrict quotient, T *hi_restrict remainder, T const *lhs, T const *rhs, std::size_t n) noexcept
697{
698 hi_axiom(n > 0);
699 hi_axiom(quotient != lhs and quotient != rhs and quotient != remainder);
700 hi_axiom(remainder != lhs and remainder != rhs);
701
702 using signed_type = std::make_signed_t<T>;
703
704 auto const lhs_is_negative = static_cast<signed_type>(lhs[n - 1]) < 0;
705 auto const rhs_is_negative = static_cast<signed_type>(rhs[n - 1]) < 0;
706
707 auto tmp = std::vector<T>{};
708 if (lhs_is_negative or rhs_is_negative) {
709 // Allocate room for negative lhs and rhs together, so only one allocation is needed.
710 tmp.resize(n * 2);
711
712 if (lhs_is_negative) {
713 // Negate lhs and point it to the tmp array.
714 T *p = tmp.data();
715 neg_carry_chain(p, lhs, n);
716 lhs = p;
717 }
718
719 if (rhs_is_negative) {
720 // Negate rhs and point it to the tmp array.
721 T *p = tmp.data() * n;
722 neg_carry_chain(p, rhs, n);
723 rhs = p;
724 }
725 }
726
727 div_carry_chain(quotient, remainder, lhs, rhs, n);
728
729 if (lhs_is_negative != rhs_is_negative) {
730 // Negate the result if the sign of lhs and rhs are different.
731 neg_carry_chain(quotient, quotient, n);
732 }
733 if (lhs_is_negative) {
734 // Remainder has same sign as divisor.
735 neg_carry_chain(remainder, remainder, n);
736 }
737}
738
739} // namespace hi
740
741hi_warning_pop();
The HikoGUI namespace.
Definition array_generic.hpp:20
T resize(T... args)
T tie(T... args)