-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Closed
Labels
Description
I notice that the QueryPerformanceCounter-based steady_clock has a specialization for handling a 10MHz QueryPerformanceFrequency value:
Lines 673 to 681 in 40640c6
| // 10 MHz is a very common QPC frequency on modern PCs. Optimizing for | |
| // this specific frequency can double the performance of this function by | |
| // avoiding the expensive frequency conversion path. | |
| constexpr long long _TenMHz = 10'000'000; | |
| if (_Freq == _TenMHz) { | |
| static_assert(period::den % _TenMHz == 0, "It should never fail."); | |
| constexpr long long _Multiplier = period::den / _TenMHz; | |
| return time_point(duration(_Ctr * _Multiplier)); | |
| } else { |
The 10MHz frequency value is common on x86, but on Windows ARM64 I notice the frequency value is frequently (always?) 24MHz.
Perhaps both common frequencies could be handled using something like this (excerpt from one of my projects, which implements its own chrono-compatible clocks):
namespace timers
{
namespace detail
{
__forceinline int64_t clock_scale_generic(int64_t _freq, int64_t _counter, int64_t _period_den)
{
// Instead of just having "(_Ctr * period::den) / _Freq",
// the algorithm below prevents overflow when _Ctr is sufficiently large.
// It assumes that _Freq * period::den does not overflow, which is currently true for nano period.
// It is not realistic for _Ctr to accumulate to large values from zero with this assumption,
// but the initial value of _Ctr could be large.
const int64_t whole = (_counter / _freq) * _period_den;
const int64_t part = (_counter % _freq) * _period_den / _freq;
return whole + part;
}
// Specialization: if the frequency value is 10MHz, use this function. The
// compiler recognizes the constants for frequency and time period and uses
// shifts and multiplies instead of divides to calculate the nanosecond value.
// This frequency is common on 64-bit x86 Windows.
__forceinline int64_t clock_scale_10MHz_nano(int64_t _counter)
{
constexpr int64_t _freq = 10000000LL;
constexpr int64_t _period_den = nano::period::den;
return detail::clock_scale_generic(_freq, _counter, _period_den);
}
// Specialization: if the frequency value is 24MHz, use this function. The
// compiler recognizes the constants for frequency and time period and uses
// shifts and multiplies instead of divides to calculate the nanosecond value.
// This frequency is common on ARM64 (Windows devices, and Apple Silicon Macs
// using Parallels Desktop)
__forceinline int64_t clock_scale_24MHz_nano(int64_t _counter)
{
constexpr int64_t _freq = 24000000LL;
constexpr int64_t _period_den = nano::period::den;
return detail::clock_scale_generic(_freq, _counter, _period_den);
}
#if defined(_M_ARM) || defined(_M_ARM64)
#define LIKELY_ARM [[likely]]
#define LIKELY_X86
#elif defined(_M_IX86) || defined(_M_X64)
#define LIKELY_ARM
#define LIKELY_X86 [[likely]]
#else
#define LIKELY_ARM
#define LIKELY_X86
#endif
__forceinline int64_t clock_scale_win32_qpc_nano(int64_t freq, int64_t counter)
{
switch (freq) {
LIKELY_X86 case 10000000LL: return detail::clock_scale_10MHz_nano(counter);
LIKELY_ARM case 24000000LL: return detail::clock_scale_24MHz_nano(counter);
}
return detail::clock_scale_generic(freq, counter, nano::period::den);
}
#undef LIKELY_ARM
#undef LIKELY_X86
__forceinline int64_t query_performance_frequency()
{
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
return freq.QuadPart;
}
__forceinline int64_t query_performance_counter()
{
LARGE_INTEGER ctr;
QueryPerformanceCounter(&ctr);
return ctr.QuadPart;
}
} // namespace detail
struct win32_qpc
{
using duration = std::chrono::nanoseconds;
using rep = duration::rep;
using period = duration::period;
using time_point = std::chrono::time_point<win32_qpc>;
static const bool is_steady = true;
static __forceinline time_point now() noexcept
{
static_assert(period::num == 1, "This assumes period::num == 1");
const int64_t freq = detail::query_performance_frequency();
const int64_t counter = detail::query_performance_counter();
return time_point(duration(detail::clock_scale_win32_qpc_nano(freq, counter)));
}
};
} // namespace timersI'm not sure how well the Microsoft Visual C++ compiler's optimizer handles this, but clang-cl does a great job with inlining and simplfying the integer divisions into shifts and multiplies.
Reactions are currently unavailable