From 0a91599aec5a0c4d4c87d3e23fea07dbb6ced0f7 Mon Sep 17 00:00:00 2001 From: bunnei Date: Mon, 15 Feb 2021 14:46:04 -0800 Subject: common: Merge uint128 to a single header file with inlines. --- src/common/CMakeLists.txt | 1 - src/common/uint128.cpp | 71 -------------------------------- src/common/uint128.h | 89 ++++++++++++++++++++++++++++++++++++++--- src/common/x64/native_clock.cpp | 58 --------------------------- 4 files changed, 84 insertions(+), 135 deletions(-) delete mode 100644 src/common/uint128.cpp diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 263c457cd..b657506b1 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -168,7 +168,6 @@ add_library(common STATIC time_zone.cpp time_zone.h tree.h - uint128.cpp uint128.h uuid.cpp uuid.h diff --git a/src/common/uint128.cpp b/src/common/uint128.cpp deleted file mode 100644 index 16bf7c828..000000000 --- a/src/common/uint128.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#ifdef _MSC_VER -#include - -#pragma intrinsic(_umul128) -#pragma intrinsic(_udiv128) -#endif -#include -#include "common/uint128.h" - -namespace Common { - -#ifdef _MSC_VER - -u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) { - u128 r{}; - r[0] = _umul128(a, b, &r[1]); - u64 remainder; -#if _MSC_VER < 1923 - return udiv128(r[1], r[0], d, &remainder); -#else - return _udiv128(r[1], r[0], d, &remainder); -#endif -} - -#else - -u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) { - const u64 diva = a / d; - const u64 moda = a % d; - const u64 divb = b / d; - const u64 modb = b % d; - return diva * b + moda * divb + moda * modb / d; -} - -#endif - -u128 Multiply64Into128(u64 a, u64 b) { - u128 result; -#ifdef _MSC_VER - result[0] = _umul128(a, b, &result[1]); -#else - unsigned __int128 tmp = a; - tmp *= b; - std::memcpy(&result, &tmp, sizeof(u128)); -#endif - return result; -} - -std::pair Divide128On32(u128 dividend, u32 divisor) { - u64 remainder = dividend[0] % divisor; - u64 accum = dividend[0] / divisor; - if (dividend[1] == 0) - return {accum, remainder}; - // We ignore dividend[1] / divisor as that overflows - const u64 first_segment = (dividend[1] % divisor) << 32; - accum += (first_segment / divisor) << 32; - const u64 second_segment = (first_segment % divisor) << 32; - accum += (second_segment / divisor); - remainder += second_segment % divisor; - if (remainder >= divisor) { - accum++; - remainder -= divisor; - } - return {accum, remainder}; -} - -} // namespace Common diff --git a/src/common/uint128.h b/src/common/uint128.h index 969259ab6..83560a9ce 100644 --- a/src/common/uint128.h +++ b/src/common/uint128.h @@ -4,19 +4,98 @@ #pragma once +#include #include + +#ifdef _MSC_VER +#include +#pragma intrinsic(__umulh) +#pragma intrinsic(_umul128) +#pragma intrinsic(_udiv128) +#else +#include +#endif + #include "common/common_types.h" namespace Common { // This function multiplies 2 u64 values and divides it by a u64 value. -[[nodiscard]] u64 MultiplyAndDivide64(u64 a, u64 b, u64 d); +[[nodiscard]] static inline u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) { +#ifdef _MSC_VER + u128 r{}; + r[0] = _umul128(a, b, &r[1]); + u64 remainder; +#if _MSC_VER < 1923 + return udiv128(r[1], r[0], d, &remainder); +#else + return _udiv128(r[1], r[0], d, &remainder); +#endif +#else + const u64 diva = a / d; + const u64 moda = a % d; + const u64 divb = b / d; + const u64 modb = b % d; + return diva * b + moda * divb + moda * modb / d; +#endif +} // This function multiplies 2 u64 values and produces a u128 value; -[[nodiscard]] u128 Multiply64Into128(u64 a, u64 b); +[[nodiscard]] static inline u128 Multiply64Into128(u64 a, u64 b) { + u128 result; +#ifdef _MSC_VER + result[0] = _umul128(a, b, &result[1]); +#else + unsigned __int128 tmp = a; + tmp *= b; + std::memcpy(&result, &tmp, sizeof(u128)); +#endif + return result; +} + +[[nodiscard]] static inline u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { +#ifdef __SIZEOF_INT128__ + const auto base = static_cast(numerator) << 64ULL; + return static_cast(base / divisor); +#elif defined(_M_X64) || defined(_M_ARM64) + std::array r = {0, numerator}; + u64 remainder; +#if _MSC_VER < 1923 + return udiv128(r[1], r[0], divisor, &remainder); +#else + return _udiv128(r[1], r[0], divisor, &remainder); +#endif +#else + // This one is bit more inaccurate. + return MultiplyAndDivide64(std::numeric_limits::max(), numerator, divisor); +#endif +} + +[[nodiscard]] static inline u64 MultiplyHigh(u64 a, u64 b) { +#ifdef __SIZEOF_INT128__ + return (static_cast(a) * static_cast(b)) >> 64; +#elif defined(_M_X64) || defined(_M_ARM64) + return __umulh(a, b); // MSVC +#else + // Generic fallback + const u64 a_lo = u32(a); + const u64 a_hi = a >> 32; + const u64 b_lo = u32(b); + const u64 b_hi = b >> 32; + + const u64 a_x_b_hi = a_hi * b_hi; + const u64 a_x_b_mid = a_hi * b_lo; + const u64 b_x_a_mid = b_hi * a_lo; + const u64 a_x_b_lo = a_lo * b_lo; + + const u64 carry_bit = (static_cast(static_cast(a_x_b_mid)) + + static_cast(static_cast(b_x_a_mid)) + (a_x_b_lo >> 32)) >> + 32; + + const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; -// This function divides a u128 by a u32 value and produces two u64 values: -// the result of division and the remainder -[[nodiscard]] std::pair Divide128On32(u128 dividend, u32 divisor); + return multhi; +#endif +} } // namespace Common diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index a65f6b832..87de40624 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -8,68 +8,10 @@ #include #include -#ifdef _MSC_VER -#include - -#pragma intrinsic(__umulh) -#pragma intrinsic(_udiv128) -#else -#include -#endif - #include "common/atomic_ops.h" #include "common/uint128.h" #include "common/x64/native_clock.h" -namespace { - -[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { -#ifdef __SIZEOF_INT128__ - const auto base = static_cast(numerator) << 64ULL; - return static_cast(base / divisor); -#elif defined(_M_X64) || defined(_M_ARM64) - std::array r = {0, numerator}; - u64 remainder; -#if _MSC_VER < 1923 - return udiv128(r[1], r[0], divisor, &remainder); -#else - return _udiv128(r[1], r[0], divisor, &remainder); -#endif -#else - // This one is bit more inaccurate. - return MultiplyAndDivide64(std::numeric_limits::max(), numerator, divisor); -#endif -} - -[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) { -#ifdef __SIZEOF_INT128__ - return (static_cast(a) * static_cast(b)) >> 64; -#elif defined(_M_X64) || defined(_M_ARM64) - return __umulh(a, b); // MSVC -#else - // Generic fallback - const u64 a_lo = u32(a); - const u64 a_hi = a >> 32; - const u64 b_lo = u32(b); - const u64 b_hi = b >> 32; - - const u64 a_x_b_hi = a_hi * b_hi; - const u64 a_x_b_mid = a_hi * b_lo; - const u64 b_x_a_mid = b_hi * a_lo; - const u64 a_x_b_lo = a_lo * b_lo; - - const u64 carry_bit = (static_cast(static_cast(a_x_b_mid)) + - static_cast(static_cast(b_x_a_mid)) + (a_x_b_lo >> 32)) >> - 32; - - const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; - - return multhi; -#endif -} - -} // namespace - namespace Common { u64 EstimateRDTSCFrequency() { -- cgit v1.2.3 From 592a64991873273679fb9315775581c4a2fe5815 Mon Sep 17 00:00:00 2001 From: bunnei Date: Mon, 15 Feb 2021 14:51:43 -0800 Subject: common: wall_clock: Optimize GetClockCycles/GetCPUCycles to use a single MUL instruction. --- src/common/wall_clock.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/common/wall_clock.cpp b/src/common/wall_clock.cpp index a8c143f85..1545993bd 100644 --- a/src/common/wall_clock.cpp +++ b/src/common/wall_clock.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include + #include "common/uint128.h" #include "common/wall_clock.h" @@ -18,7 +20,9 @@ using base_time_point = std::chrono::time_point; class StandardWallClock final : public WallClock { public: explicit StandardWallClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_) - : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, false) { + : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, false), + emulated_clock_factor{GetFixedPoint64Factor(emulated_clock_frequency, 1000000000)}, + emulated_cpu_factor{GetFixedPoint64Factor(emulated_cpu_frequency, 1000000000)} { start_time = base_timer::now(); } @@ -41,16 +45,11 @@ public: } u64 GetClockCycles() override { - std::chrono::nanoseconds time_now = GetTimeNS(); - const u128 temporary = - Common::Multiply64Into128(time_now.count(), emulated_clock_frequency); - return Common::Divide128On32(temporary, 1000000000).first; + return MultiplyHigh(GetTimeNS().count(), emulated_clock_factor); } u64 GetCPUCycles() override { - std::chrono::nanoseconds time_now = GetTimeNS(); - const u128 temporary = Common::Multiply64Into128(time_now.count(), emulated_cpu_frequency); - return Common::Divide128On32(temporary, 1000000000).first; + return MultiplyHigh(GetTimeNS().count(), emulated_cpu_factor); } void Pause([[maybe_unused]] bool is_paused) override { @@ -59,6 +58,8 @@ public: private: base_time_point start_time; + const u64 emulated_clock_factor; + const u64 emulated_cpu_factor; }; #ifdef ARCHITECTURE_x86_64 -- cgit v1.2.3 From f3345e84ad3d1a771eec36d30de7717fcae7e63b Mon Sep 17 00:00:00 2001 From: bunnei Date: Mon, 15 Feb 2021 14:54:06 -0800 Subject: core: core_timing_util: Optimize core timing math. - Avoids a lot of unnecessary 128-bit math for imperceptible accuracy. --- src/core/CMakeLists.txt | 1 - src/core/core_timing_util.cpp | 84 ------------------------------------------- src/core/core_timing_util.h | 61 ++++++++++++++++++++++++------- 3 files changed, 48 insertions(+), 98 deletions(-) delete mode 100644 src/core/core_timing_util.cpp diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 28196d26a..c6bdf72ec 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -19,7 +19,6 @@ add_library(core STATIC core.h core_timing.cpp core_timing.h - core_timing_util.cpp core_timing_util.h cpu_manager.cpp cpu_manager.h diff --git a/src/core/core_timing_util.cpp b/src/core/core_timing_util.cpp deleted file mode 100644 index 8ce8e602e..000000000 --- a/src/core/core_timing_util.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project -// Licensed under GPLv2+ -// Refer to the license.txt file included. - -#include "core/core_timing_util.h" - -#include -#include -#include "common/logging/log.h" -#include "common/uint128.h" -#include "core/hardware_properties.h" - -namespace Core::Timing { - -constexpr u64 MAX_VALUE_TO_MULTIPLY = std::numeric_limits::max() / Hardware::BASE_CLOCK_RATE; - -s64 msToCycles(std::chrono::milliseconds ms) { - if (static_cast(ms.count() / 1000) > MAX_VALUE_TO_MULTIPLY) { - LOG_ERROR(Core_Timing, "Integer overflow, use max value"); - return std::numeric_limits::max(); - } - if (static_cast(ms.count()) > MAX_VALUE_TO_MULTIPLY) { - LOG_DEBUG(Core_Timing, "Time very big, do rounding"); - return Hardware::BASE_CLOCK_RATE * (ms.count() / 1000); - } - return (Hardware::BASE_CLOCK_RATE * ms.count()) / 1000; -} - -s64 usToCycles(std::chrono::microseconds us) { - if (static_cast(us.count() / 1000000) > MAX_VALUE_TO_MULTIPLY) { - LOG_ERROR(Core_Timing, "Integer overflow, use max value"); - return std::numeric_limits::max(); - } - if (static_cast(us.count()) > MAX_VALUE_TO_MULTIPLY) { - LOG_DEBUG(Core_Timing, "Time very big, do rounding"); - return Hardware::BASE_CLOCK_RATE * (us.count() / 1000000); - } - return (Hardware::BASE_CLOCK_RATE * us.count()) / 1000000; -} - -s64 nsToCycles(std::chrono::nanoseconds ns) { - const u128 temporal = Common::Multiply64Into128(ns.count(), Hardware::BASE_CLOCK_RATE); - return Common::Divide128On32(temporal, static_cast(1000000000)).first; -} - -u64 msToClockCycles(std::chrono::milliseconds ns) { - const u128 temp = Common::Multiply64Into128(ns.count(), Hardware::CNTFREQ); - return Common::Divide128On32(temp, 1000).first; -} - -u64 usToClockCycles(std::chrono::microseconds ns) { - const u128 temp = Common::Multiply64Into128(ns.count(), Hardware::CNTFREQ); - return Common::Divide128On32(temp, 1000000).first; -} - -u64 nsToClockCycles(std::chrono::nanoseconds ns) { - const u128 temp = Common::Multiply64Into128(ns.count(), Hardware::CNTFREQ); - return Common::Divide128On32(temp, 1000000000).first; -} - -u64 CpuCyclesToClockCycles(u64 ticks) { - const u128 temporal = Common::Multiply64Into128(ticks, Hardware::CNTFREQ); - return Common::Divide128On32(temporal, static_cast(Hardware::BASE_CLOCK_RATE)).first; -} - -std::chrono::milliseconds CyclesToMs(s64 cycles) { - const u128 temporal = Common::Multiply64Into128(cycles, 1000); - u64 ms = Common::Divide128On32(temporal, static_cast(Hardware::BASE_CLOCK_RATE)).first; - return std::chrono::milliseconds(ms); -} - -std::chrono::nanoseconds CyclesToNs(s64 cycles) { - const u128 temporal = Common::Multiply64Into128(cycles, 1000000000); - u64 ns = Common::Divide128On32(temporal, static_cast(Hardware::BASE_CLOCK_RATE)).first; - return std::chrono::nanoseconds(ns); -} - -std::chrono::microseconds CyclesToUs(s64 cycles) { - const u128 temporal = Common::Multiply64Into128(cycles, 1000000); - u64 us = Common::Divide128On32(temporal, static_cast(Hardware::BASE_CLOCK_RATE)).first; - return std::chrono::microseconds(us); -} - -} // namespace Core::Timing diff --git a/src/core/core_timing_util.h b/src/core/core_timing_util.h index e4a046bf9..14c36a485 100644 --- a/src/core/core_timing_util.h +++ b/src/core/core_timing_util.h @@ -1,24 +1,59 @@ -// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project -// Licensed under GPLv2+ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once #include + #include "common/common_types.h" +#include "core/hardware_properties.h" namespace Core::Timing { -s64 msToCycles(std::chrono::milliseconds ms); -s64 usToCycles(std::chrono::microseconds us); -s64 nsToCycles(std::chrono::nanoseconds ns); -u64 msToClockCycles(std::chrono::milliseconds ns); -u64 usToClockCycles(std::chrono::microseconds ns); -u64 nsToClockCycles(std::chrono::nanoseconds ns); -std::chrono::milliseconds CyclesToMs(s64 cycles); -std::chrono::nanoseconds CyclesToNs(s64 cycles); -std::chrono::microseconds CyclesToUs(s64 cycles); - -u64 CpuCyclesToClockCycles(u64 ticks); +namespace detail { +constexpr u64 CNTFREQ_ADJUSTED = Hardware::CNTFREQ / 1000; +constexpr u64 BASE_CLOCK_RATE_ADJUSTED = Hardware::BASE_CLOCK_RATE / 1000; +} // namespace detail + +[[nodiscard]] constexpr s64 msToCycles(std::chrono::milliseconds ms) { + return ms.count() * detail::BASE_CLOCK_RATE_ADJUSTED; +} + +[[nodiscard]] constexpr s64 usToCycles(std::chrono::microseconds us) { + return us.count() * detail::BASE_CLOCK_RATE_ADJUSTED / 1000; +} + +[[nodiscard]] constexpr s64 nsToCycles(std::chrono::nanoseconds ns) { + return ns.count() * detail::BASE_CLOCK_RATE_ADJUSTED / 1000000; +} + +[[nodiscard]] constexpr u64 msToClockCycles(std::chrono::milliseconds ms) { + return static_cast(ms.count()) * detail::CNTFREQ_ADJUSTED; +} + +[[nodiscard]] constexpr u64 usToClockCycles(std::chrono::microseconds us) { + return us.count() * detail::CNTFREQ_ADJUSTED / 1000; +} + +[[nodiscard]] constexpr u64 nsToClockCycles(std::chrono::nanoseconds ns) { + return ns.count() * detail::CNTFREQ_ADJUSTED / 1000000; +} + +[[nodiscard]] constexpr u64 CpuCyclesToClockCycles(u64 ticks) { + return ticks * detail::CNTFREQ_ADJUSTED / detail::BASE_CLOCK_RATE_ADJUSTED; +} + +[[nodiscard]] constexpr std::chrono::milliseconds CyclesToMs(s64 cycles) { + return std::chrono::milliseconds(cycles / detail::BASE_CLOCK_RATE_ADJUSTED); +} + +[[nodiscard]] constexpr std::chrono::nanoseconds CyclesToNs(s64 cycles) { + return std::chrono::nanoseconds(cycles * 1000000 / detail::BASE_CLOCK_RATE_ADJUSTED); +} + +[[nodiscard]] constexpr std::chrono::microseconds CyclesToUs(s64 cycles) { + return std::chrono::microseconds(cycles * 1000 / detail::BASE_CLOCK_RATE_ADJUSTED); +} } // namespace Core::Timing -- cgit v1.2.3