HikoGUI
A low latency retained GUI
Loading...
Searching...
No Matches
time_stamp_count.hpp
1// Copyright Take Vos 2021.
2// Distributed under the Boost Software License, Version 1.0.
3// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
4
5#pragma once
6
7#include "chrono.hpp"
8#include "../utility/utility.hpp"
9#include "../concurrency/concurrency.hpp"
10#include "../numeric/numeric.hpp"
11#include "../macros.hpp"
12#include <atomic>
13#include <array>
14#include <cstdint>
15#include <chrono>
16#include <utility>
17#include <thread>
18
19#if HI_OPERATING_SYSTEM == HI_OS_WINDOWS
20#include <intrin.h>
21#elif HI_OPERATING_SYSTEM == HI_OS_LINUX
22#include <x86intrin.h>
23#endif
24
25hi_export_module(hikogui.time.time_stamp_count);
26
27
28hi_export namespace hi::inline v1 {
29
public:
    /** Tag type to select the in-place sampling constructor. */
    struct inplace {};

    /** Default construct a zeroed time-stamp count (no sample taken). */
    constexpr time_stamp_count() noexcept : _count(0), _aux(0), _thread_id(0) {}

    /** Construct from a raw TSC count and aux value; the thread-id is left at zero.
     * @param count The raw time-stamp-counter value.
     * @param aux The aux value (IA32_TSC_AUX) associated with the sample.
     */
    constexpr time_stamp_count(uint64_t count, uint32_t aux) noexcept : _count(count), _aux(aux), _thread_id(0) {}
45
    /** Use a constructor to in-place create the timestamp.
     * The aux value (and therefore the cpu-id) is not captured; it is set to zero.
     */
    explicit time_stamp_count(time_stamp_count::inplace) noexcept : _aux(0), _thread_id(0)
    {
#if HI_PROCESSOR == HI_CPU_X86_64
        // The aux output of rdtscp is discarded for this overload.
        uint32_t tmp;
        _count = __rdtscp(&tmp);
#else
        // Fallback for non-x86-64: use the steady clock's raw tick count.
        _count = std::chrono::steady_clock::now().time_since_epoch().count();
#endif
    }
57
    // In-place sample that also captures the aux value (IA32_TSC_AUX),
    // from which cpu_id() can later be derived.
    {
#if HI_PROCESSOR == HI_CPU_X86_64
        _count = __rdtscp(&_aux);
#else
        // Fallback for non-x86-64: steady clock ticks; no aux value available.
        _count = std::chrono::steady_clock::now().time_since_epoch().count();
        _aux = 0;
#endif
    }
69
    // In-place sample that captures the aux value and the current thread id.
    {
#if HI_PROCESSOR == HI_CPU_X86_64 and HI_OPERATING_SYSTEM == HI_OS_WINDOWS
        // Byte offset of the current thread id inside the x64 TEB.
        // NOTE(review): relies on a stable-but-undocumented TEB layout — confirm.
        constexpr uint64_t NT_TIB_CurrentThreadID = 0x48;

        _count = __rdtscp(&_aux);
        // Read the thread id straight from the TEB via the gs segment,
        // avoiding a system call.
        _thread_id = __readgsdword(NT_TIB_CurrentThreadID);
#else
        _count = std::chrono::steady_clock::now().time_since_epoch().count();
        _thread_id = 0;
        _aux = 0;
#endif
    }
85
89 [[nodiscard]] static time_stamp_count now() noexcept
90 {
92 }
93
100 [[nodiscard]] ssize_t cpu_id() const noexcept
101 {
102 if (_aux_is_cpu_id.load(std::memory_order::relaxed)) {
103 // On Linux the upper bits are used for a node-id.
104 return _aux & 0xfff;
105 } else {
106 return cpu_id_fallback();
107 }
108 }
109
    /** Get the thread id.
     * @return The thread id captured at sample time, or 0 when it was not captured.
     */
    [[nodiscard]] constexpr uint32_t thread_id() const noexcept
    {
        return _thread_id;
    }
117
    /** Get the count since epoch.
     * @return The raw time-stamp-counter value of this sample.
     */
    [[nodiscard]] constexpr uint64_t count() const noexcept
    {
        return _count;
    }
126
132 [[nodiscard]] static std::chrono::nanoseconds duration_from_count(uint64_t count) noexcept
133 {
134 using namespace std::chrono_literals;
135
136 auto const[lo, hi] = mul_carry(count, _period.load(std::memory_order::relaxed));
137 return 1ns * static_cast<int64_t>((hi << 32) | (lo >> 32));
138 }
139
    /** Convert to nanoseconds since epoch.
     * @return Nanoseconds derived from this sample's count and the calibrated TSC period.
     */
    [[nodiscard]] std::chrono::nanoseconds time_since_epoch() const noexcept
    {
        return duration_from_count(_count);
    }
148
    /** Advance this sample's count by @a rhs ticks; aux and thread-id are unchanged. */
    constexpr time_stamp_count& operator+=(uint64_t rhs) noexcept
    {
        _count += rhs;
        return *this;
    }
154
155 [[nodiscard]] constexpr time_stamp_count operator+(uint64_t rhs) const noexcept
156 {
157 auto tmp = *this;
158 tmp += rhs;
159 return tmp;
160 }
161
    // Get a good-quality UTC/TSC sample pair by repeatedly bracketing a
    // UTC read between two TSC reads and keeping the tightest bracket.
    {
        auto shortest_diff = std::numeric_limits<uint64_t>::max();
        time_stamp_count shortest_tsc;
        utc_nanoseconds shortest_tp;

        // With three samples gathered on the same CPU we should
        // have a TSC/UTC/TSC combination that was run inside a single time-slice.
        for (auto i = 0; i != 10; ++i) {
            auto const tmp_tsc1 = time_stamp_count::now();
            auto const tmp_tp = std::chrono::utc_clock::now();
            auto const tmp_tsc2 = time_stamp_count::now();

            if (tmp_tsc1.cpu_id() != tmp_tsc2.cpu_id()) {
                throw os_error("CPU Switch detected during get_sample(), which should never happen");
            }

            if (tmp_tsc1.count() > tmp_tsc2.count()) {
                // TSC skipped backwards, this may happen when the TSC of multiple
                // CPUs get synchronized with each other.
                // For example when waking up from sleep.
                continue;
            }

            auto const diff = tmp_tsc2.count() - tmp_tsc1.count();

            // Keep the sample with the tightest TSC bracket around the UTC read.
            if (diff < shortest_diff) {
                shortest_diff = diff;
                shortest_tp = tmp_tp;
                // Estimate the TSC value at the midpoint of the bracket.
                shortest_tsc = tmp_tsc1 + (diff / 2);
            }
        }

        // Every iteration hit the backwards-TSC case; no usable sample.
        if (shortest_diff == std::numeric_limits<uint64_t>::max()) {
            throw os_error("Unable to get TSC sample.");
        }

        return {shortest_tp, shortest_tsc};
    }
207
    /** Measure the frequency of the time_stamp_count.
     *
     * Pins the thread to a single CPU, brackets a sleep of @a sample_duration
     * with two TSC/UTC sample pairs, and derives ticks-per-second from the deltas.
     *
     * @param sample_duration How long to sleep between the two sample pairs.
     * @return The TSC frequency in Hz, or 0 when the UTC clock did not advance.
     * @throws os_error When a CPU switch or a non-advancing TSC is detected.
     */
    [[nodiscard]] static uint64_t measure_frequency(std::chrono::milliseconds sample_duration)
    {
        using namespace std::chrono_literals;

        // Only sample the frequency of one of the TSC clocks.
        auto const prev_mask = set_thread_affinity(current_cpu_id());

        auto const [tp1, tsc1] = time_stamp_utc_sample();
        std::this_thread::sleep_for(sample_duration);
        auto const [tp2, tsc2] = time_stamp_utc_sample();

        // Reset the mask back.
        set_thread_affinity_mask(prev_mask);

        if (tsc1._aux != tsc2._aux) {
            // This must never happen, as we set the thread affinity to a single CPU
            // if this happens something is seriously wrong.
            throw os_error("CPU Switch detected when measuring the TSC frequency.");
        }

        if (tsc1.count() >= tsc2.count()) {
            // The TSC should only be reset during the very early boot sequence when
            // the CPUs are started and synchronized. It may also happen to a CPU that
            // was hot-swapped while the computer is running, in that case the CPU
            // should not be running applications yet.
            throw os_error("TSC Did not advance during measuring its frequency.");
        }

        if (tp1 >= tp2) {
            // The UTC clock did not advance, maybe a time server changed the clock.
            return 0;
        }

        // Calculate the frequency by dividing the delta-tsc by the duration.
        // We scale both the delta-tsc and duration by 1'000'000'000 before the
        // division. The duration is scaled by 1'000'000'000 by dividing by 1ns.
        auto const[delta_tsc_lo, delta_tsc_hi] = mul_carry(tsc2.count() - tsc1.count(), uint64_t{1'000'000'000});
        auto duration = narrow_cast<uint64_t>((tp2 - tp1) / 1ns);
        return wide_div(delta_tsc_lo, delta_tsc_hi, duration);
    }
256
257 static void set_frequency(uint64_t frequency) noexcept
258 {
259 auto const period = (uint64_t{1'000'000'000} << 32) / frequency;
260 _period.store(period, std::memory_order_relaxed);
261 }
262
    // Start the time_stamp_count subsystem: calibrate the TSC frequency and
    // build the aux-value -> cpu-id table, returning both results.
    {
        auto const frequency = configure_frequency();
        auto const aux_is_cpu_id = populate_aux_values();
        return {frequency, aux_is_cpu_id};
    }
275
private:
    /** The raw TSC value of this sample. */
    uint64_t _count;

    /** The aux value (IA32_TSC_AUX) read together with the count.
     * On Linux the low 12 bits hold the cpu-id; the upper bits a node-id (see cpu_id()).
     */
    uint32_t _aux;

    /** The sampling thread's id; 0 unless captured via inplace_with_thread_id. */
    uint32_t _thread_id;

    /** TSC period in nanoseconds-per-tick, 32.32 fixed-point; set by set_frequency(). */
    inline static std::atomic<uint64_t> _period = 0;

    /** True when the aux value directly encodes the cpu-id; set by populate_aux_values(). */
    inline static std::atomic<bool> _aux_is_cpu_id = false;

    /** Number of valid entries in _aux_values / _cpu_ids; published with release ordering. */
    inline static std::atomic<std::size_t> _num_aux_values = 0;

    /** Observed aux value per visited CPU, parallel to _cpu_ids. */
    inline static std::array<uint32_t, maximum_num_cpus> _aux_values;

    /** cpu-id for each corresponding entry in _aux_values. */
    inline static std::array<std::size_t, maximum_num_cpus> _cpu_ids;
    /** Find the logical CPU id matching this sample's aux value.
     *
     * Used when the aux value does not directly encode the cpu-id; performs a
     * SIMD linear search through the table built by populate_aux_values().
     *
     * @return The cpu-id matching _aux, or -1 when no match is found.
     */
    [[nodiscard]] ssize_t cpu_id_fallback() const noexcept
    {
        // Broadcast the aux value into all four 32-bit lanes for comparison.
        auto aux_value_ = _mm_set1_epi32(_aux);

        // Acquire pairs with the release store in populate_aux_values().
        auto const num_aux_values = _num_aux_values.load(std::memory_order_acquire);
        hi_assert(_aux_values.size() == _cpu_ids.size());
        hi_assert_bounds(num_aux_values, _aux_values);

        // Compare four table entries per iteration.
        for (std::size_t i = 0; i < num_aux_values; i += 4) {
            auto const row = _mm_loadu_si128(reinterpret_cast<__m128i const *>(_aux_values.data() + i));
            auto const row_result = _mm_cmpeq_epi32(row, aux_value_);
            auto const row_result_ = _mm_castsi128_ps(row_result);
            auto const row_result_mask = _mm_movemask_ps(row_result_);
            if (to_bool(row_result_mask)) {
                auto const j = i + std::countr_zero(narrow_cast<unsigned int>(row_result_mask));
                if (j < num_aux_values) {
                    return _cpu_ids[j];
                }

                // The match fell in a tail lane beyond the valid entries.
                return -1;
            }
        }

        return -1;
    }
342
    /** Visit every CPU once and record its TSC aux value.
     *
     * Walks the thread affinity across each CPU, samples the TSC there, and
     * fills the parallel _aux_values / _cpu_ids tables used by cpu_id_fallback().
     *
     * @return True when every CPU's aux value directly encodes its cpu-id.
     */
    static bool populate_aux_values()
    {
        // Keep track of the original thread affinity of the main thread.
        auto prev_mask = set_thread_affinity(current_cpu_id());

        // Create a table of cpu_ids.
        std::size_t next_cpu = 0;
        std::size_t current_cpu = 0;
        bool aux_is_cpu_id = true;
        do {
            current_cpu = advance_thread_affinity(next_cpu);

            auto i = _num_aux_values.load(std::memory_order::acquire);
            auto tsc = time_stamp_count::now();
            _aux_values[i] = tsc._aux;
            _cpu_ids[i] = current_cpu;
            // Publish the entry before bumping the count; the release store
            // pairs with the acquire load in cpu_id_fallback().
            _num_aux_values.store(i + 1, std::memory_order::release);

            // On Linux the low 12 bits of the aux value hold the cpu-id.
            if ((tsc._aux & 0xfff) != current_cpu) {
                aux_is_cpu_id = false;
            }

        } while (next_cpu > current_cpu);

        _aux_is_cpu_id.store(aux_is_cpu_id, std::memory_order_relaxed);

        // Set the thread affinity back to the original.
        set_thread_affinity_mask(prev_mask);
        return aux_is_cpu_id;
    }
373 static uint64_t configure_frequency()
374 {
375 using namespace std::chrono_literals;
376
377 // This function is called from the crt and must therefor be quick as we do not
378 // want to keep the user waiting. We are satisfied if the measured frequency is
379 // to within 1% accuracy.
380
381 // We take an average over 4 times in case the hires_utc_clock gets reset by a time server.
382 uint64_t frequency = 0;
383 uint64_t num_samples = 0;
384 for (int i = 0; i != 4; ++i) {
385 auto const f = time_stamp_count::measure_frequency(25ms);
386 if (f != 0) {
387 frequency += f;
388 ++num_samples;
389 }
390 }
391 if (num_samples == 0) {
392 throw os_error("Unable the measure the frequency of the TSC. The UTC time did not advance.");
393 }
394 frequency /= num_samples;
395
396 time_stamp_count::set_frequency(frequency);
397 return frequency;
398 }
399};
400
401} // namespace hi::inline v1
std::vector< bool > set_thread_affinity_mask(std::vector< bool > const &mask)
Set the current thread CPU affinity mask.
std::size_t current_cpu_id() noexcept
Get the current CPU id.
std::size_t advance_thread_affinity(std::size_t &cpu) noexcept
Advance thread affinity to the next CPU.
Definition thread_intf.hpp:121
std::vector< bool > set_thread_affinity(std::size_t cpu_id)
Set the current thread CPU affinity to a single CPU.
Definition thread_intf.hpp:103
The HikoGUI namespace.
Definition array_generic.hpp:20
DOXYGEN BUG.
Definition algorithm_misc.hpp:20
Since Windows 10 QueryPerformanceCounter() counts at only 10MHz which is too low to measure performa...
Definition time_stamp_count.hpp:36
constexpr uint32_t thread_id() const noexcept
Get the thread id.
Definition time_stamp_count.hpp:113
time_stamp_count(time_stamp_count::inplace_with_cpu_id) noexcept
Use a constructor to in-place create the timestamp.
Definition time_stamp_count.hpp:60
static time_stamp_count now() noexcept
Get the current count from the CPU's time stamp count.
Definition time_stamp_count.hpp:89
time_stamp_count(time_stamp_count::inplace) noexcept
Use a constructor to in-place create the timestamp.
Definition time_stamp_count.hpp:48
static std::pair< uint64_t, bool > start_subsystem()
Start the time_stamp_count subsystem.
Definition time_stamp_count.hpp:269
static std::pair< utc_nanoseconds, time_stamp_count > time_stamp_utc_sample()
Get a good quality time sample.
Definition time_stamp_count.hpp:168
ssize_t cpu_id() const noexcept
Get the logical CPU index.
Definition time_stamp_count.hpp:100
time_stamp_count(time_stamp_count::inplace_with_thread_id) noexcept
Use a constructor to in-place create the timestamp.
Definition time_stamp_count.hpp:72
static uint64_t measure_frequency(std::chrono::milliseconds sample_duration)
Measure the frequency of the time_stamp_count.
Definition time_stamp_count.hpp:216
static std::chrono::nanoseconds duration_from_count(uint64_t count) noexcept
Convert a time-stamp count to a duration.
Definition time_stamp_count.hpp:132
std::chrono::nanoseconds time_since_epoch() const noexcept
Convert to nanoseconds since epoch.
Definition time_stamp_count.hpp:144
constexpr uint64_t count() const noexcept
Get the count since epoch.
Definition time_stamp_count.hpp:122
Definition time_stamp_count.hpp:38
Definition time_stamp_count.hpp:39
Definition time_stamp_count.hpp:40
T data(T... args)
T load(T... args)
T max(T... args)
T size(T... args)
T sleep_for(T... args)
T store(T... args)