HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
time_stamp_count.hpp
1// Copyright Take Vos 2021.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "../utility/utility.hpp"
8#include "../concurrency/concurrency.hpp"
9#include "../numeric/module.hpp"
10#include "../macros.hpp"
11#include <atomic>
12#include <array>
13#include <cstdint>
14
15#if HI_OPERATING_SYSTEM == HI_OS_WINDOWS
16#include <intrin.h>
17#elif HI_OPERATING_SYSTEM == HI_OS_LINUX
18#include <x86intrin.h>
19#endif
20
21
22
23namespace hi::inline v1 {
24
32public:
33 struct inplace {};
36
37 constexpr time_stamp_count() noexcept : _count(0), _aux(0), _thread_id(0) {}
38
39 constexpr time_stamp_count(uint64_t count, uint32_t aux) noexcept : _count(count), _aux(aux), _thread_id(0) {}
40
43 explicit time_stamp_count(time_stamp_count::inplace) noexcept : _aux(0), _thread_id(0)
44 {
45#if HI_PROCESSOR == HI_CPU_X64
46 uint32_t tmp;
47 _count = __rdtscp(&tmp);
48#else
49#error "Not Implemented"
50#endif
51 }
52
56 {
57#if HI_PROCESSOR == HI_CPU_X64
58 _count = __rdtscp(&_aux);
59#else
60#error "Not Implemented"
61#endif
62 }
63
67 {
68#if HI_PROCESSOR == HI_CPU_X64 and HI_OPERATING_SYSTEM == HI_OS_WINDOWS
69 constexpr uint64_t NT_TIB_CurrentThreadID = 0x48;
70
71 _count = __rdtscp(&_aux);
72 _thread_id = __readgsdword(NT_TIB_CurrentThreadID);
73#else
74#error "Not Implemented"
75#endif
76 }
77
81 [[nodiscard]] static time_stamp_count now() noexcept
82 {
84 }
85
92 [[nodiscard]] ssize_t cpu_id() const noexcept
93 {
94 if (_aux_is_cpu_id.load(std::memory_order::relaxed)) {
95 // On Linux the upper bits are used for a node-id.
96 return _aux & 0xfff;
97 } else {
98 return cpu_id_fallback();
99 }
100 }
101
105 [[nodiscard]] constexpr uint32_t thread_id() const noexcept
106 {
107 return _thread_id;
108 }
109
114 [[nodiscard]] constexpr uint64_t count() const noexcept
115 {
116 return _count;
117 }
118
124 [[nodiscard]] static std::chrono::nanoseconds duration_from_count(uint64_t count) noexcept
125 {
126 using namespace std::chrono_literals;
127
128 hilet[lo, hi] = mul_carry(count, _period.load(std::memory_order::relaxed));
129 return 1ns * static_cast<int64_t>((hi << 32) | (lo >> 32));
130 }
131
136 [[nodiscard]] std::chrono::nanoseconds time_since_epoch() const noexcept
137 {
138 return duration_from_count(_count);
139 }
140
141 constexpr time_stamp_count& operator+=(uint64_t rhs) noexcept
142 {
143 _count += rhs;
144 return *this;
145 }
146
147 [[nodiscard]] constexpr time_stamp_count operator+(uint64_t rhs) const noexcept
148 {
149 auto tmp = *this;
150 tmp += rhs;
151 return tmp;
152 }
153
161 {
162 auto shortest_diff = std::numeric_limits<uint64_t>::max();
163 time_stamp_count shortest_tsc;
164 utc_nanoseconds shortest_tp;
165
166 // With three samples gathered on the same CPU we should
167 // have a TSC/UTC/TSC combination that was run inside a single time-slice.
168 for (auto i = 0; i != 10; ++i) {
169 hilet tmp_tsc1 = time_stamp_count::now();
170 hilet tmp_tp = std::chrono::utc_clock::now();
171 hilet tmp_tsc2 = time_stamp_count::now();
172
173 if (tmp_tsc1.cpu_id() != tmp_tsc2.cpu_id()) {
174 throw os_error("CPU Switch detected during get_sample(), which should never happen");
175 }
176
177 if (tmp_tsc1.count() > tmp_tsc2.count()) {
178 // TSC skipped backwards, this may happen when the TSC of multiple
179 // CPUs get synchronized with each other.
180 // For example when waking up from sleep.
181 continue;
182 }
183
184 hilet diff = tmp_tsc2.count() - tmp_tsc1.count();
185
186 if (diff < shortest_diff) {
187 shortest_diff = diff;
188 shortest_tp = tmp_tp;
189 shortest_tsc = tmp_tsc1 + (diff / 2);
190 }
191 }
192
193 if (shortest_diff == std::numeric_limits<uint64_t>::max()) {
194 throw os_error("Unable to get TSC sample.");
195 }
196
197 return {shortest_tp, shortest_tsc};
198 }
199
208 [[nodiscard]] static uint64_t measure_frequency(std::chrono::milliseconds sample_duration)
209 {
210 using namespace std::chrono_literals;
211
212 // Only sample the frequency of one of the TSC clocks.
213 hilet prev_mask = set_thread_affinity(current_cpu_id());
214
215 hilet [tp1, tsc1] = time_stamp_utc_sample();
216 std::this_thread::sleep_for(sample_duration);
217 hilet [tp2, tsc2] = time_stamp_utc_sample();
218
219 // Reset the mask back.
220 set_thread_affinity_mask(prev_mask);
221
222 if (tsc1._aux != tsc2._aux) {
223 // This must never happen, as we set the thread affinity to a single CPU
224 // if this happens something is seriously wrong.
225 throw os_error("CPU Switch detected when measuring the TSC frequency.");
226 }
227
228 if (tsc1.count() >= tsc2.count()) {
229 // The TSC should only be reset during the very early boot sequence when
230 // the CPUs are started and synchronized. It may also happen to a CPU that
231 // was hot-swapped while the computer is running, in that case the CPU
232 // should not be running applications yet.
233 throw os_error("TSC Did not advance during measuring its frequency.");
234 }
235
236 if (tp1 >= tp2) {
237 // The UTC clock did not advance, maybe a time server changed the clock.
238 return 0;
239 }
240
241 // Calculate the frequency by dividing the delta-tsc by the duration.
242 // We scale both the delta-tsc and duration by 1'000'000'000 before the
243 // division. The duration is scaled by 1'000'000'000 by dividing by 1ns.
244 hilet[delta_tsc_lo, delta_tsc_hi] = mul_carry(tsc2.count() - tsc1.count(), uint64_t{1'000'000'000});
245 auto duration = narrow_cast<uint64_t>((tp2 - tp1) / 1ns);
246 return wide_div(delta_tsc_lo, delta_tsc_hi, duration);
247 }
248
249 static void set_frequency(uint64_t frequency) noexcept
250 {
251 hilet period = (uint64_t{1'000'000'000} << 32) / frequency;
252 _period.store(period, std::memory_order_relaxed);
253 }
254
262 {
263 hilet frequency = configure_frequency();
264 hilet aux_is_cpu_id = populate_aux_values();
265 return {frequency, aux_is_cpu_id};
266 }
267
268private:
269 uint64_t _count;
270
278 uint32_t _aux;
279
282 uint32_t _thread_id;
283
286 inline static std::atomic<uint64_t> _period = 0;
287
288 inline static std::atomic<bool> _aux_is_cpu_id = false;
289
292 inline static std::atomic<std::size_t> _num_aux_values = 0;
293
296 inline static std::array<uint32_t, maximum_num_cpus> _aux_values;
297
300 inline static std::array<std::size_t, maximum_num_cpus> _cpu_ids;
301
309 [[nodiscard]] ssize_t cpu_id_fallback() const noexcept
310 {
311 auto aux_value_ = _mm_set1_epi32(_aux);
312
313 hilet num_aux_values = _num_aux_values.load(std::memory_order_acquire);
314 hi_assert(_aux_values.size() == _cpu_ids.size());
315 hi_assert_bounds(num_aux_values, _aux_values);
316
317 for (std::size_t i = 0; i < num_aux_values; i += 4) {
318 hilet row = _mm_loadu_si128(reinterpret_cast<__m128i const *>(_aux_values.data() + i));
319 hilet row_result = _mm_cmpeq_epi32(row, aux_value_);
320 hilet row_result_ = _mm_castsi128_ps(row_result);
321 hilet row_result_mask = _mm_movemask_ps(row_result_);
322 if (to_bool(row_result_mask)) {
323 hilet j = i + std::countr_zero(narrow_cast<unsigned int>(row_result_mask));
324 if (j < num_aux_values) {
325 return _cpu_ids[j];
326 }
327
328 return -1;
329 }
330 }
331
332 return -1;
333 }
334
335 static bool populate_aux_values()
336 {
337 // Keep track of the original thread affinity of the main thread.
338 auto prev_mask = set_thread_affinity(current_cpu_id());
339
340 // Create a table of cpu_ids.
341 std::size_t next_cpu = 0;
342 std::size_t current_cpu = 0;
343 bool aux_is_cpu_id = true;
344 do {
345 current_cpu = advance_thread_affinity(next_cpu);
346
347 auto i = _num_aux_values.load(std::memory_order::acquire);
348 auto tsc = time_stamp_count::now();
349 _aux_values[i] = tsc._aux;
350 _cpu_ids[i] = current_cpu;
351 _num_aux_values.store(i + 1, std::memory_order::release);
352
353 if ((tsc._aux & 0xfff) != current_cpu) {
354 aux_is_cpu_id = false;
355 }
356
357 } while (next_cpu > current_cpu);
358
359 _aux_is_cpu_id.store(aux_is_cpu_id, std::memory_order_relaxed);
360
361 // Set the thread affinity back to the original.
362 set_thread_affinity_mask(prev_mask);
363 return aux_is_cpu_id;
364 }
365 static uint64_t configure_frequency()
366 {
367 using namespace std::chrono_literals;
368
369 // This function is called from the crt and must therefor be quick as we do not
370 // want to keep the user waiting. We are satisfied if the measured frequency is
371 // to within 1% accuracy.
372
373 // We take an average over 4 times in case the hires_utc_clock gets reset by a time server.
374 uint64_t frequency = 0;
375 uint64_t num_samples = 0;
376 for (int i = 0; i != 4; ++i) {
377 hilet f = time_stamp_count::measure_frequency(25ms);
378 if (f != 0) {
379 frequency += f;
380 ++num_samples;
381 }
382 }
383 if (num_samples == 0) {
384 throw os_error("Unable the measure the frequency of the TSC. The UTC time did not advance.");
385 }
386 frequency /= num_samples;
387
388 time_stamp_count::set_frequency(frequency);
389 return frequency;
390 }
391};
392
393} // namespace hi::inline v1
std::vector< bool > set_thread_affinity_mask(std::vector< bool > const &mask)
Set the current thread CPU affinity mask.
std::size_t current_cpu_id() noexcept
Get the current CPU id.
std::size_t advance_thread_affinity(std::size_t &cpu) noexcept
Advance thread affinity to the next CPU.
Definition thread_intf.hpp:122
std::vector< bool > set_thread_affinity(std::size_t cpu_id)
Set the current thread CPU affinity to a single CPU.
Definition thread_intf.hpp:104
DOXYGEN BUG.
Definition algorithm.hpp:16
geometry/margins.hpp
Definition lookahead_iterator.hpp:5
constexpr Out narrow_cast(In const &rhs) noexcept
Cast numeric values without loss of precision.
Definition cast.hpp:377
Since Windows 10 QueryPerformanceCounter() counts at only 10MHz which is too low to measure performa...
Definition time_stamp_count.hpp:31
constexpr uint32_t thread_id() const noexcept
Get the thread id.
Definition time_stamp_count.hpp:105
time_stamp_count(time_stamp_count::inplace_with_cpu_id) noexcept
Use a constructor to in-place create the timestamp.
Definition time_stamp_count.hpp:55
static time_stamp_count now() noexcept
Get the current count from the CPU's time stamp count.
Definition time_stamp_count.hpp:81
time_stamp_count(time_stamp_count::inplace) noexcept
Use a constructor to in-place create the timestamp.
Definition time_stamp_count.hpp:43
static std::pair< uint64_t, bool > start_subsystem()
Start the time_stamp_count subsystem.
Definition time_stamp_count.hpp:261
static std::pair< utc_nanoseconds, time_stamp_count > time_stamp_utc_sample()
Get a good quality time sample.
Definition time_stamp_count.hpp:160
ssize_t cpu_id() const noexcept
Get the logical CPU index.
Definition time_stamp_count.hpp:92
time_stamp_count(time_stamp_count::inplace_with_thread_id) noexcept
Use a constructor to in-place create the timestamp.
Definition time_stamp_count.hpp:66
static uint64_t measure_frequency(std::chrono::milliseconds sample_duration)
Measure the frequency of the time_stamp_count.
Definition time_stamp_count.hpp:208
static std::chrono::nanoseconds duration_from_count(uint64_t count) noexcept
Convert a time-stamp count to a duration.
Definition time_stamp_count.hpp:124
std::chrono::nanoseconds time_since_epoch() const noexcept
Convert to nanoseconds since epoch.
Definition time_stamp_count.hpp:136
constexpr uint64_t count() const noexcept
Get the count since epoch.
Definition time_stamp_count.hpp:114
Definition time_stamp_count.hpp:33
Definition time_stamp_count.hpp:34
Definition time_stamp_count.hpp:35
T data(T... args)
T load(T... args)
T max(T... args)
T size(T... args)
T sleep_for(T... args)
T store(T... args)