From 82c2601555b59a94d7160f2fd686cb63d32dd423 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 16 Jan 2021 20:48:58 -0300 Subject: video_core: Reimplement the buffer cache Reimplement the buffer cache using cached bindings and page level granularity for modification tracking. This also drops the usage of shared pointers and virtual functions from the cache. - Bindings are cached, allowing to skip work when the game changes few bits between draws. - OpenGL Assembly shaders no longer copy when a region has been modified from the GPU to emulate constant buffers, instead GL_EXT_memory_object is used to alias sub-buffers within the same allocation. - OpenGL Assembly shaders stream constant buffer data using glProgramBufferParametersIuivNV, from NV_parameter_buffer_object. In theory this should save one hash table resolve inside the driver compared to glBufferSubData. - A new OpenGL stream buffer is implemented based on fences for drivers that are not Nvidia's proprietary, due to their low performance on partial glBufferSubData calls synchronized with 3D rendering (that some games use a lot). - Most optimizations are shared between APIs now, allowing Vulkan to cache more bindings than before, skipping unnecesarry work. This commit adds the necessary infrastructure to use Vulkan object from OpenGL. Overall, it improves performance and fixes some bugs present on the old cache. There are still some edge cases hit by some games that harm performance on some vendors, this are planned to be fixed in later commits. --- src/video_core/buffer_cache/buffer_block.h | 62 - src/video_core/buffer_cache/buffer_cache.cpp | 13 + src/video_core/buffer_cache/buffer_cache.h | 1598 ++++++++++++++++++-------- src/video_core/buffer_cache/map_interval.cpp | 33 - src/video_core/buffer_cache/map_interval.h | 93 -- 5 files changed, 1132 insertions(+), 667 deletions(-) delete mode 100644 src/video_core/buffer_cache/buffer_block.h create mode 100644 src/video_core/buffer_cache/buffer_cache.cpp delete mode 100644 src/video_core/buffer_cache/map_interval.cpp delete mode 100644 src/video_core/buffer_cache/map_interval.h (limited to 'src/video_core/buffer_cache') diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h deleted file mode 100644 index e9306194a..000000000 --- a/src/video_core/buffer_cache/buffer_block.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "common/common_types.h" - -namespace VideoCommon { - -class BufferBlock { -public: - [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const { - return (cpu_addr < end) && (cpu_addr_end > start); - } - - [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const { - return cpu_addr <= other_start && other_end <= cpu_addr_end; - } - - [[nodiscard]] std::size_t Offset(VAddr in_addr) const { - return static_cast(in_addr - cpu_addr); - } - - [[nodiscard]] VAddr CpuAddr() const { - return cpu_addr; - } - - [[nodiscard]] VAddr CpuAddrEnd() const { - return cpu_addr_end; - } - - void SetCpuAddr(VAddr new_addr) { - cpu_addr = new_addr; - cpu_addr_end = new_addr + size; - } - - [[nodiscard]] std::size_t Size() const { - return size; - } - - [[nodiscard]] u64 Epoch() const { - return epoch; - } - - void SetEpoch(u64 new_epoch) { - epoch = new_epoch; - } - -protected: - explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} { - SetCpuAddr(cpu_addr_); - } - -private: - VAddr cpu_addr{}; - VAddr cpu_addr_end{}; - std::size_t size{}; - u64 epoch{}; -}; - -} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp new file mode 100644 index 000000000..ab32294c8 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -0,0 +1,13 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/microprofile.h" + +namespace VideoCommon { + +MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128)); +MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128)); +MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128)); + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 83b9ee871..e4f3c8e35 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -4,591 +4,1231 @@ #pragma once -#include +#include +#include +#include #include #include +#include #include -#include -#include #include #include -#include -#include -#include "common/alignment.h" -#include "common/assert.h" #include "common/common_types.h" -#include "common/logging/log.h" -#include "core/core.h" +#include "common/div_ceil.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" #include "core/memory.h" #include "core/settings.h" -#include "video_core/buffer_cache/buffer_block.h" -#include "video_core/buffer_cache/map_interval.h" +#include "video_core/buffer_cache/buffer_base.h" +#include "video_core/delayed_destruction_ring.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" namespace VideoCommon { -template +MICROPROFILE_DECLARE(GPU_PrepareBuffers); +MICROPROFILE_DECLARE(GPU_BindUploadBuffers); +MICROPROFILE_DECLARE(GPU_DownloadMemory); + +using BufferId = SlotId; + +constexpr u32 NUM_VERTEX_BUFFERS = 32; +constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; +constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; +constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; +constexpr u32 NUM_STORAGE_BUFFERS = 16; +constexpr u32 NUM_STAGES = 5; + +template class BufferCache { - using IntervalSet = boost::icl::interval_set; - using IntervalType = typename IntervalSet::interval_type; - using VectorMapInterval = boost::container::small_vector; + // Page size for caching purposes. + // This is unrelated to the CPU page size and it can be changed as it seems optimal. + static constexpr u32 PAGE_BITS = 16; + static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; - static constexpr u64 WRITE_PAGE_BIT = 11; - static constexpr u64 BLOCK_PAGE_BITS = 21; - static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; + static constexpr bool IS_OPENGL = P::IS_OPENGL; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = + P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = + P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; + static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; -public: - struct BufferInfo { - BufferType handle; - u64 offset; - u64 address; + static constexpr BufferId NULL_BUFFER_ID{0}; + + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + using Runtime = typename P::Runtime; + using Buffer = typename P::Buffer; + + struct Empty {}; + + struct Binding { + VAddr cpu_addr{}; + u32 size{}; + BufferId buffer_id; }; - BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, - bool is_written = false, bool use_fast_cbuf = false) { - std::lock_guard lock{mutex}; + static constexpr Binding NULL_BINDING{ + .cpu_addr = 0, + .size = 0, + .buffer_id = NULL_BUFFER_ID, + }; - const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - return GetEmptyBuffer(size); - } +public: + static constexpr size_t SKIP_CACHE_SIZE = 4096; - // Cache management is a big overhead, so only cache entries with a given size. - // TODO: Figure out which size is the best for given games. - constexpr std::size_t max_stream_size = 0x800; - if (use_fast_cbuf || size < max_stream_size) { - if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { - const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size); - if (use_fast_cbuf) { - u8* dest; - if (is_granular) { - dest = gpu_memory.GetPointer(gpu_addr); - } else { - staging_buffer.resize(size); - dest = staging_buffer.data(); - gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); - } - return ConstBufferUpload(dest, size); - } - if (is_granular) { - u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); - return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { - std::memcpy(dest, host_ptr, size); - }); - } else { - return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) { - gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); - }); - } - } - } + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + Runtime& runtime_); - Buffer* const block = GetBlock(*cpu_addr, size); - MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size); - if (!map) { - return GetEmptyBuffer(size); - } - if (is_written) { - map->MarkAsModified(true, GetModifiedTicks()); - if (Settings::IsGPULevelHigh() && - Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - MarkForAsyncFlush(map); - } - if (!map->is_written) { - map->is_written = true; - MarkRegionAsWritten(map->start, map->end - 1); - } - } + void TickFrame(); - return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; - } + void WriteMemory(VAddr cpu_addr, u64 size); - /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. - BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, - std::size_t alignment = 4) { - std::lock_guard lock{mutex}; - return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) { - std::memcpy(dest, raw_pointer, size); - }); - } + void CachedWriteMemory(VAddr cpu_addr, u64 size); - /// Prepares the buffer cache for data uploading - /// @param max_size Maximum number of bytes that will be uploaded - /// @return True when a stream buffer invalidation was required, false otherwise - void Map(std::size_t max_size) { - std::lock_guard lock{mutex}; + void DownloadMemory(VAddr cpu_addr, u64 size); - std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); - buffer_offset = buffer_offset_base; - } + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); - /// Finishes the upload stream - void Unmap() { - std::lock_guard lock{mutex}; - stream_buffer.Unmap(buffer_offset - buffer_offset_base); - } + void UpdateGraphicsBuffers(bool is_indexed); - /// Function called at the end of each frame, inteded for deferred operations - void TickFrame() { - ++epoch; + void UpdateComputeBuffers(); - while (!pending_destruction.empty()) { - // Delay at least 4 frames before destruction. - // This is due to triple buffering happening on some drivers. - static constexpr u64 epochs_to_destroy = 5; - if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { - break; - } - pending_destruction.pop(); - } - } + void BindHostGeometryBuffers(bool is_indexed); - /// Write any cached resources overlapping the specified region back to memory - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + void BindHostStageBuffers(size_t stage); - VectorMapInterval objects = GetMapsInRange(addr, size); - std::sort(objects.begin(), objects.end(), - [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; }); - for (MapInterval* object : objects) { - if (object->is_modified && object->is_registered) { - mutex.unlock(); - FlushMap(object); - mutex.lock(); - } - } - } + void BindHostComputeBuffers(); - bool MustFlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + void SetEnabledUniformBuffers(size_t stage, u32 enabled); - const VectorMapInterval objects = GetMapsInRange(addr, size); - return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) { - return map->is_modified && map->is_registered; - }); - } + void SetEnabledComputeUniformBuffers(u32 enabled); - /// Mark the specified region as being invalidated - void InvalidateRegion(VAddr addr, u64 size) { - std::lock_guard lock{mutex}; + void UnbindGraphicsStorageBuffers(size_t stage); - for (auto& object : GetMapsInRange(addr, size)) { - if (object->is_registered) { - Unregister(object); - } - } - } + void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); - void OnCPUWrite(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + void UnbindComputeStorageBuffers(); - for (MapInterval* object : GetMapsInRange(addr, size)) { - if (object->is_memory_marked && object->is_registered) { - UnmarkMemory(object); - object->is_sync_pending = true; - marked_for_unregister.emplace_back(object); - } - } - } + void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); - void SyncGuestHost() { - std::lock_guard lock{mutex}; + void FlushCachedWrites(); - for (auto& object : marked_for_unregister) { - if (object->is_registered) { - object->is_sync_pending = false; - Unregister(object); - } + /// Return true when there are uncommitted buffers to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; + + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; + + /// Commit asynchronous downloads + void CommitAsyncFlushes(); + + /// Pop asynchronous downloads + void PopAsyncFlushes(); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + + std::mutex mutex; + +private: + template + static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { + for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { + const int disabled_bits = std::countr_zero(enabled_mask); + index += disabled_bits; + enabled_mask >>= disabled_bits; + func(index); } - marked_for_unregister.clear(); } - void CommitAsyncFlushes() { - if (uncommitted_flushes) { - auto commit_list = std::make_shared>(); - for (MapInterval* map : *uncommitted_flushes) { - if (map->is_registered && map->is_modified) { - // TODO(Blinkhawk): Implement backend asynchronous flushing - // AsyncFlushMap(map) - commit_list->push_back(map); - } - } - if (!commit_list->empty()) { - committed_flushes.push_back(commit_list); - } else { - committed_flushes.emplace_back(); + template + void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { + const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); + for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; + continue; } - } else { - committed_flushes.emplace_back(); + Buffer& buffer = slot_buffers[buffer_id]; + func(buffer_id, buffer); + + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); } - uncommitted_flushes.reset(); } - bool ShouldWaitAsyncFlushes() const { - return !committed_flushes.empty() && committed_flushes.front() != nullptr; + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { + return (cpu_addr & ~Core::Memory::PAGE_MASK) == + ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); } - bool HasUncommittedFlushes() const { - return uncommitted_flushes != nullptr; - } + void BindHostIndexBuffer(); - void PopAsyncFlushes() { - if (committed_flushes.empty()) { - return; - } - auto& flush_list = committed_flushes.front(); - if (!flush_list) { - committed_flushes.pop_front(); - return; - } - for (MapInterval* map : *flush_list) { - if (map->is_registered) { - // TODO(Blinkhawk): Replace this for reading the asynchronous flush - FlushMap(map); - } - } - committed_flushes.pop_front(); - } + void BindHostVertexBuffers(); - virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; + void BindHostGraphicsUniformBuffers(size_t stage); -protected: - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, - Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, - StreamBuffer& stream_buffer_) - : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, - stream_buffer{stream_buffer_} {} + void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); - ~BufferCache() = default; + void BindHostGraphicsStorageBuffers(size_t stage); - virtual std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) = 0; + void BindHostTransformFeedbackBuffers(); - virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { - return {}; - } + void BindHostComputeUniformBuffers(); - /// Register an object into the cache - MapInterval* Register(MapInterval new_map, bool inherit_written = false) { - const VAddr cpu_addr = new_map.start; - if (!cpu_addr) { - LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", - new_map.gpu_addr); - return nullptr; - } - const std::size_t size = new_map.end - new_map.start; - new_map.is_registered = true; - rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); - new_map.is_memory_marked = true; - if (inherit_written) { - MarkRegionAsWritten(new_map.start, new_map.end - 1); - new_map.is_written = true; - } - MapInterval* const storage = mapped_addresses_allocator.Allocate(); - *storage = new_map; - mapped_addresses.insert(*storage); - return storage; - } + void BindHostComputeStorageBuffers(); - void UnmarkMemory(MapInterval* map) { - if (!map->is_memory_marked) { - return; - } - const std::size_t size = map->end - map->start; - rasterizer.UpdatePagesCachedCount(map->start, size, -1); - map->is_memory_marked = false; - } - - /// Unregisters an object from the cache - void Unregister(MapInterval* map) { - UnmarkMemory(map); - map->is_registered = false; - if (map->is_sync_pending) { - map->is_sync_pending = false; - marked_for_unregister.remove(map); + void DoUpdateGraphicsBuffers(bool is_indexed); + + void DoUpdateComputeBuffers(); + + void UpdateIndexBuffer(); + + void UpdateVertexBuffers(); + + void UpdateVertexBuffer(u32 index); + + void UpdateUniformBuffers(size_t stage); + + void UpdateStorageBuffers(size_t stage); + + void UpdateTransformFeedbackBuffers(); + + void UpdateTransformFeedbackBuffer(u32 index); + + void UpdateComputeUniformBuffers(); + + void UpdateComputeStorageBuffers(); + + void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); + + void Register(BufferId buffer_id); + + void Unregister(BufferId buffer_id); + + template + void ChangeRegister(BufferId buffer_id); + + void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); + + void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span copies); + + void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span copies); + + void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, + std::span copies); + + void DeleteBuffer(BufferId buffer_id); + + void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); + + void NotifyBufferDeletion(); + + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; + + [[nodiscard]] std::span ImmediateBufferWithData(VAddr cpu_addr, size_t size); + + [[nodiscard]] std::span ImmediateBuffer(size_t wanted_capacity); + + [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; + + VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + Core::Memory::Memory& cpu_memory; + Runtime& runtime; + + SlotVector slot_buffers; + DelayedDestructionRing delayed_destruction_ring; + + u32 last_index_count = 0; + + Binding index_buffer; + std::array vertex_buffers; + std::array, NUM_STAGES> uniform_buffers; + std::array, NUM_STAGES> storage_buffers; + std::array transform_feedback_buffers; + + std::array compute_uniform_buffers; + std::array compute_storage_buffers; + + std::array enabled_uniform_buffers{}; + u32 enabled_compute_uniform_buffers = 0; + + std::array enabled_storage_buffers{}; + std::array written_storage_buffers{}; + u32 enabled_compute_storage_buffers = 0; + u32 written_compute_storage_buffers = 0; + + std::array fast_bound_uniform_buffers{}; + + bool has_deleted_buffers = false; + + std::conditional_t, Empty> + dirty_uniform_buffers{}; + + std::vector cached_write_buffer_ids; + + // TODO: This data structure is not optimal and it should be reworked + std::vector uncommitted_downloads; + std::deque> committed_downloads; + + size_t immediate_buffer_capacity = 0; + std::unique_ptr immediate_buffer_alloc; + + std::array> PAGE_BITS)> page_table; +}; + +template +BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + Runtime& runtime_) + : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, + gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { + // Ensure the first slot is used for the null buffer + void(slot_buffers.insert(runtime, NullBufferParams{})); +} + +template +void BufferCache

::TickFrame() { + delayed_destruction_ring.Tick(); +} + +template +void BufferCache

::WriteMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { + buffer.MarkRegionAsCpuModified(cpu_addr, size); + }); +} + +template +void BufferCache

::CachedWriteMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { + if (!buffer.HasCachedWrites()) { + cached_write_buffer_ids.push_back(buffer_id); } - if (map->is_written) { - UnmarkRegionAsWritten(map->start, map->end - 1); + buffer.CachedCpuWrite(cpu_addr, size); + }); +} + +template +void BufferCache

::DownloadMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + return; } - const auto it = mapped_addresses.find(*map); - ASSERT(it != mapped_addresses.end()); - mapped_addresses.erase(it); - mapped_addresses_allocator.Release(map); - } - -private: - MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { - const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); - if (overlaps.empty()) { - const VAddr cpu_addr_end = cpu_addr + size; - if (gpu_memory.IsGranularRange(gpu_addr, size)) { - u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); - block->Upload(block->Offset(cpu_addr), size, host_ptr); - } else { - staging_buffer.resize(size); - gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + const u8* const mapped_memory = download_staging.mapped_span.data(); + const std::span copies_span(copies.data(), copies.data() + copies.size()); + runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); + runtime.Finish(); + for (const BufferCopy& copy : copies) { + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + const u8* copy_mapped_memory = mapped_memory + copy.dst_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); } - return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); - } - - const VAddr cpu_addr_end = cpu_addr + size; - if (overlaps.size() == 1) { - MapInterval* const current_map = overlaps[0]; - if (current_map->IsInside(cpu_addr, cpu_addr_end)) { - return current_map; + } else { + const std::span immediate_buffer = ImmediateBuffer(largest_copy); + for (const BufferCopy& copy : copies) { + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); } } - VAddr new_start = cpu_addr; - VAddr new_end = cpu_addr_end; - bool write_inheritance = false; - bool modified_inheritance = false; - // Calculate new buffer parameters - for (MapInterval* overlap : overlaps) { - new_start = std::min(overlap->start, new_start); - new_end = std::max(overlap->end, new_end); - write_inheritance |= overlap->is_written; - modified_inheritance |= overlap->is_modified; + }); +} + +template +void BufferCache

::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) { + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + uniform_buffers[stage][index] = NULL_BINDING; + return; + } + const Binding binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = BufferId{}, + }; + uniform_buffers[stage][index] = binding; +} + +template +void BufferCache

::UpdateGraphicsBuffers(bool is_indexed) { + MICROPROFILE_SCOPE(GPU_PrepareBuffers); + do { + has_deleted_buffers = false; + DoUpdateGraphicsBuffers(is_indexed); + } while (has_deleted_buffers); +} + +template +void BufferCache

::UpdateComputeBuffers() { + MICROPROFILE_SCOPE(GPU_PrepareBuffers); + do { + has_deleted_buffers = false; + DoUpdateComputeBuffers(); + } while (has_deleted_buffers); +} + +template +void BufferCache

::BindHostGeometryBuffers(bool is_indexed) { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + if (is_indexed) { + BindHostIndexBuffer(); + } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { + const auto& regs = maxwell3d.regs; + if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) { + runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count); } - GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; - for (auto& overlap : overlaps) { - Unregister(overlap); + } + BindHostVertexBuffers(); + BindHostTransformFeedbackBuffers(); +} + +template +void BufferCache

::BindHostStageBuffers(size_t stage) { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + BindHostGraphicsUniformBuffers(stage); + BindHostGraphicsStorageBuffers(stage); +} + +template +void BufferCache

::BindHostComputeBuffers() { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + BindHostComputeUniformBuffers(); + BindHostComputeStorageBuffers(); +} + +template +void BufferCache

::SetEnabledUniformBuffers(size_t stage, u32 enabled) { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + if (enabled_uniform_buffers[stage] != enabled) { + dirty_uniform_buffers[stage] = ~u32{0}; } - UpdateBlock(block, new_start, new_end, overlaps); - - const MapInterval new_map{new_start, new_end, new_gpu_addr}; - MapInterval* const map = Register(new_map, write_inheritance); - if (!map) { - return nullptr; + } + enabled_uniform_buffers[stage] = enabled; +} + +template +void BufferCache

::SetEnabledComputeUniformBuffers(u32 enabled) { + enabled_compute_uniform_buffers = enabled; +} + +template +void BufferCache

::UnbindGraphicsStorageBuffers(size_t stage) { + enabled_storage_buffers[stage] = 0; + written_storage_buffers[stage] = 0; +} + +template +void BufferCache

::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, + u32 cbuf_offset, bool is_written) { + enabled_storage_buffers[stage] |= 1U << ssbo_index; + written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index; + + const auto& cbufs = maxwell3d.state.shader_stages[stage]; + const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset; + storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr); +} + +template +void BufferCache

::UnbindComputeStorageBuffers() { + enabled_compute_storage_buffers = 0; + written_compute_storage_buffers = 0; +} + +template +void BufferCache

::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written) { + enabled_compute_storage_buffers |= 1U << ssbo_index; + written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index; + + const auto& launch_desc = kepler_compute.launch_description; + ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0); + + const auto& cbufs = launch_desc.const_buffer_config; + const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset; + compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr); +} + +template +void BufferCache

::FlushCachedWrites() { + for (const BufferId buffer_id : cached_write_buffer_ids) { + slot_buffers[buffer_id].FlushCachedWrites(); + } + cached_write_buffer_ids.clear(); +} + +template +bool BufferCache

::HasUncommittedFlushes() const noexcept { + return !uncommitted_downloads.empty(); +} + +template +bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { + return !committed_downloads.empty() && !committed_downloads.front().empty(); +} + +template +void BufferCache

::CommitAsyncFlushes() { + // This is intentionally passing the value by copy + committed_downloads.push_front(uncommitted_downloads); + uncommitted_downloads.clear(); +} + +template +void BufferCache

::PopAsyncFlushes() { + if (committed_downloads.empty()) { + return; + } + auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); }); + const std::span download_ids = committed_downloads.back(); + if (download_ids.empty()) { + return; + } + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + boost::container::small_vector, 1> downloads; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + for (const BufferId buffer_id : download_ids) { + slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { + downloads.push_back({ + BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }, + buffer_id, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + } + if (downloads.empty()) { + return; + } + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + for (const auto [copy, buffer_id] : downloads) { + const std::array copies{copy}; + runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); } - if (modified_inheritance) { - map->MarkAsModified(true, GetModifiedTicks()); - if (Settings::IsGPULevelHigh() && - Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - MarkForAsyncFlush(map); - } + runtime.Finish(); + for (const auto [copy, buffer_id] : downloads) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + } + } else { + const std::span immediate_buffer = ImmediateBuffer(largest_copy); + for (const auto [copy, buffer_id] : downloads) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); } - return map; } - - void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { - const IntervalType base_interval{start, end}; - IntervalSet interval_set{}; - interval_set.add(base_interval); - for (auto& overlap : overlaps) { - const IntervalType subtract{overlap->start, overlap->end}; - interval_set.subtract(subtract); +} + +template +bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { + const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); + for (u64 page = addr >> PAGE_BITS; page < page_end;) { + const BufferId image_id = page_table[page]; + if (!image_id) { + ++page; + continue; } - for (auto& interval : interval_set) { - const std::size_t size = interval.upper() - interval.lower(); - if (size == 0) { - continue; - } - staging_buffer.resize(size); - cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); - block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); + Buffer& buffer = slot_buffers[image_id]; + if (buffer.IsRegionGpuModified(addr, size)) { + return true; } + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); } - - VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { - VectorMapInterval result; - if (size == 0) { - return result; + return false; +} + +template +void BufferCache

::BindHostIndexBuffer() { + Buffer& buffer = slot_buffers[index_buffer.buffer_id]; + const u32 offset = buffer.Offset(index_buffer.cpu_addr); + const u32 size = index_buffer.size; + SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); + if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { + runtime.BindIndexBuffer(buffer, offset, size); + } else { + runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format, + maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count, + buffer, offset, size); + } +} + +template +void BufferCache

::BindHostVertexBuffers() { + auto& flags = maxwell3d.dirty.flags; + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + const Binding& binding = vertex_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); + if (!flags[Dirty::VertexBuffer0 + index]) { + continue; } + flags[Dirty::VertexBuffer0 + index] = false; + + const u32 stride = maxwell3d.regs.vertex_array[index].stride; + const u32 offset = buffer.Offset(binding.cpu_addr); + runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride); + } +} - const VAddr addr_end = addr + size; - auto it = mapped_addresses.lower_bound(addr); - if (it != mapped_addresses.begin()) { - --it; +template +void BufferCache

::BindHostGraphicsUniformBuffers(size_t stage) { + u32 dirty = ~0U; + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty = std::exchange(dirty_uniform_buffers[stage], 0); + } + u32 binding_index = 0; + ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { + const bool needs_bind = ((dirty >> index) & 1) != 0; + BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + ++binding_index; } - while (it != mapped_addresses.end() && it->start < addr_end) { - if (it->Overlaps(addr, addr_end)) { - result.push_back(&*it); + }); +} + +template +void BufferCache

::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, + bool needs_bind) { + const Binding& binding = uniform_buffers[stage][index]; + const VAddr cpu_addr = binding.cpu_addr; + const u32 size = binding.size; + Buffer& buffer = slot_buffers[binding.buffer_id]; + if constexpr (IS_OPENGL) { + if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) { + if (runtime.HasFastBufferSubData()) { + // Fast path for Nvidia + if (!HasFastUniformBufferBound(stage, binding_index)) { + // We only have to bind when the currently bound buffer is not the fast version + fast_bound_uniform_buffers[stage] |= 1U << binding_index; + runtime.BindFastUniformBuffer(stage, binding_index, size); + } + const auto span = ImmediateBufferWithData(cpu_addr, size); + runtime.PushFastUniformBuffer(stage, binding_index, span); + } else { + // Stream buffer path to avoid stalling on non-Nvidia drivers + const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size); + cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); } - ++it; + return; } - return result; } - - /// Returns a ticks counter used for tracking when cached objects were last modified - u64 GetModifiedTicks() { - return ++modified_ticks; + // Classic cached path + SynchronizeBuffer(buffer, cpu_addr, size); + if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) { + // Skip binding if it's not needed and if the bound buffer is not the fast version + // This exists to avoid instances where the fast buffer is bound and a GPU write happens + return; } + fast_bound_uniform_buffers[stage] &= ~(1U << binding_index); - void FlushMap(MapInterval* map) { - const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); - ASSERT_OR_EXECUTE(it != blocks.end(), return;); - - std::shared_ptr block = it->second; - - const std::size_t size = map->end - map->start; - staging_buffer.resize(size); - block->Download(block->Offset(map->start), size, staging_buffer.data()); - cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size); - map->MarkAsModified(false, 0); + const u32 offset = buffer.Offset(cpu_addr); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size); + } else { + runtime.BindUniformBuffer(buffer, offset, size); } +} + +template +void BufferCache

::BindHostGraphicsStorageBuffers(size_t stage) { + u32 binding_index = 0; + ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { + const Binding& binding = storage_buffers[stage][index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0; + if constexpr (NEEDS_BIND_STORAGE_INDEX) { + runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written); + ++binding_index; + } else { + runtime.BindStorageBuffer(buffer, offset, size, is_written); + } + }); +} - template - BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { - AlignBuffer(alignment); - const std::size_t uploaded_offset = buffer_offset; - callable(buffer_ptr); - - buffer_ptr += size; - buffer_offset += size; - return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()}; +template +void BufferCache

::BindHostTransformFeedbackBuffers() { + if (maxwell3d.regs.tfb_enabled == 0) { + return; } - - void AlignBuffer(std::size_t alignment) { - // Align the offset, not the mapped pointer - const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); - buffer_ptr += offset_aligned - buffer_offset; - buffer_offset = offset_aligned; + for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { + const Binding& binding = transform_feedback_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + runtime.BindTransformFeedbackBuffer(index, buffer, offset, size); } +} - std::shared_ptr EnlargeBlock(std::shared_ptr buffer) { - const std::size_t old_size = buffer->Size(); - const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; - const VAddr cpu_addr = buffer->CpuAddr(); - std::shared_ptr new_buffer = CreateBlock(cpu_addr, new_size); - new_buffer->CopyFrom(*buffer, 0, 0, old_size); - QueueDestruction(std::move(buffer)); - - const VAddr cpu_addr_end = cpu_addr + new_size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - blocks.insert_or_assign(page_start, new_buffer); +template +void BufferCache

::BindHostComputeUniformBuffers() { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + // Mark all uniform buffers as dirty + dirty_uniform_buffers.fill(~u32{0}); + } + u32 binding_index = 0; + ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { + const Binding& binding = compute_uniform_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size); + ++binding_index; + } else { + runtime.BindUniformBuffer(buffer, offset, size); } + }); +} + +template +void BufferCache

::BindHostComputeStorageBuffers() { + u32 binding_index = 0; + ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { + const Binding& binding = compute_storage_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0; + if constexpr (NEEDS_BIND_STORAGE_INDEX) { + runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written); + ++binding_index; + } else { + runtime.BindStorageBuffer(buffer, offset, size, is_written); + } + }); +} - return new_buffer; +template +void BufferCache

::DoUpdateGraphicsBuffers(bool is_indexed) { + if (is_indexed) { + UpdateIndexBuffer(); } + UpdateVertexBuffers(); + UpdateTransformFeedbackBuffers(); + for (size_t stage = 0; stage < NUM_STAGES; ++stage) { + UpdateUniformBuffers(stage); + UpdateStorageBuffers(stage); + } +} + +template +void BufferCache

::DoUpdateComputeBuffers() { + UpdateComputeUniformBuffers(); + UpdateComputeStorageBuffers(); +} + +template +void BufferCache

::UpdateIndexBuffer() { + // We have to check for the dirty flags and index count + // The index count is currently changed without updating the dirty flags + const auto& index_array = maxwell3d.regs.index_array; + auto& flags = maxwell3d.dirty.flags; + if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) { + return; + } + flags[Dirty::IndexBuffer] = false; + last_index_count = index_array.count; + + const GPUVAddr gpu_addr_begin = index_array.StartAddress(); + const GPUVAddr gpu_addr_end = index_array.EndAddress(); + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); + const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); + const u32 draw_size = index_array.count * index_array.FormatSizeInBytes(); + const u32 size = std::min(address_size, draw_size); + if (size == 0 || !cpu_addr) { + index_buffer = NULL_BINDING; + return; + } + index_buffer = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = FindBuffer(*cpu_addr, size), + }; +} - std::shared_ptr MergeBlocks(std::shared_ptr first, - std::shared_ptr second) { - const std::size_t size_1 = first->Size(); - const std::size_t size_2 = second->Size(); - const VAddr first_addr = first->CpuAddr(); - const VAddr second_addr = second->CpuAddr(); - const VAddr new_addr = std::min(first_addr, second_addr); - const std::size_t new_size = size_1 + size_2; - - std::shared_ptr new_buffer = CreateBlock(new_addr, new_size); - new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); - new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); - QueueDestruction(std::move(first)); - QueueDestruction(std::move(second)); +template +void BufferCache

::UpdateVertexBuffers() { + auto& flags = maxwell3d.dirty.flags; + if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) { + return; + } + flags[Dirty::VertexBuffers] = false; - const VAddr cpu_addr_end = new_addr + new_size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - blocks.insert_or_assign(page_start, new_buffer); - } - return new_buffer; + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + UpdateVertexBuffer(index); } +} - Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { - std::shared_ptr found; +template +void BufferCache

::UpdateVertexBuffer(u32 index) { + if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) { + return; + } + const auto& array = maxwell3d.regs.vertex_array[index]; + const auto& limit = maxwell3d.regs.vertex_array_limit[index]; + const GPUVAddr gpu_addr_begin = array.StartAddress(); + const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1; + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); + const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); + const u32 size = address_size; // TODO: Analyze stride and number of vertices + if (array.enable == 0 || size == 0 || !cpu_addr) { + vertex_buffers[index] = NULL_BINDING; + return; + } + vertex_buffers[index] = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = FindBuffer(*cpu_addr, size), + }; +} + +template +void BufferCache

::UpdateUniformBuffers(size_t stage) { + ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { + Binding& binding = uniform_buffers[stage][index]; + if (binding.buffer_id) { + // Already updated + return; + } + // Mark as dirty + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty_uniform_buffers[stage] |= 1U << index; + } + // Resolve buffer + binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); + }); +} + +template +void BufferCache

::UpdateStorageBuffers(size_t stage) { + const u32 written_mask = written_storage_buffers[stage]; + ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { + // Resolve buffer + Binding& binding = storage_buffers[stage][index]; + const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); + binding.buffer_id = buffer_id; + // Mark buffer as written if needed + if (((written_mask >> index) & 1) != 0) { + MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); + } + }); +} - const VAddr cpu_addr_end = cpu_addr + size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - auto it = blocks.find(page_start); - if (it == blocks.end()) { - if (found) { - found = EnlargeBlock(found); - continue; - } - const VAddr start_addr = page_start << BLOCK_PAGE_BITS; - found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); - blocks.insert_or_assign(page_start, found); - continue; - } - if (!found) { - found = it->second; - continue; - } - if (found != it->second) { - found = MergeBlocks(std::move(found), it->second); +template +void BufferCache

::UpdateTransformFeedbackBuffers() { + if (maxwell3d.regs.tfb_enabled == 0) { + return; + } + for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { + UpdateTransformFeedbackBuffer(index); + } +} + +template +void BufferCache

::UpdateTransformFeedbackBuffer(u32 index) { + const auto& binding = maxwell3d.regs.tfb_bindings[index]; + const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset; + const u32 size = binding.buffer_size; + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) { + transform_feedback_buffers[index] = NULL_BINDING; + return; + } + const BufferId buffer_id = FindBuffer(*cpu_addr, size); + transform_feedback_buffers[index] = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = buffer_id, + }; + MarkWrittenBuffer(buffer_id, *cpu_addr, size); +} + +template +void BufferCache

::UpdateComputeUniformBuffers() { + ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { + Binding& binding = compute_uniform_buffers[index]; + binding = NULL_BINDING; + const auto& launch_desc = kepler_compute.launch_description; + if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) { + const auto& cbuf = launch_desc.const_buffer_config[index]; + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address()); + if (cpu_addr) { + binding.cpu_addr = *cpu_addr; + binding.size = cbuf.size; } } - return found.get(); + binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); + }); +} + +template +void BufferCache

::UpdateComputeStorageBuffers() { + ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { + // Resolve buffer + Binding& binding = compute_storage_buffers[index]; + const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); + binding.buffer_id = buffer_id; + // Mark as written if needed + if (((written_compute_storage_buffers >> index) & 1) != 0) { + MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); + } + }); +} + +template +void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.MarkRegionAsGpuModified(cpu_addr, size); + + const bool is_accuracy_high = Settings::IsGPULevelHigh(); + const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); + if (!is_accuracy_high || !is_async) { + return; } + if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) { + // Already inserted + return; + } + uncommitted_downloads.push_back(buffer_id); +} - void MarkRegionAsWritten(VAddr start, VAddr end) { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { - ++it->second; - } +template +BufferId BufferCache

::FindBuffer(VAddr cpu_addr, u32 size) { + if (cpu_addr == 0) { + return NULL_BUFFER_ID; + } + const u64 page = cpu_addr >> PAGE_BITS; + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + return CreateBuffer(cpu_addr, size); + } + const Buffer& buffer = slot_buffers[buffer_id]; + if (buffer.IsInBounds(cpu_addr, size)) { + return buffer_id; + } + return CreateBuffer(cpu_addr, size); +} + +template +BufferId BufferCache

::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { + std::vector overlap_ids; + VAddr cpu_addr_begin = cpu_addr; + VAddr cpu_addr_end = cpu_addr + wanted_size; + for (; cpu_addr >> PAGE_BITS < Common::DivCeil(cpu_addr_end, PAGE_SIZE); + cpu_addr += PAGE_SIZE) { + const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; + if (!overlap_id) { + continue; + } + Buffer& overlap = slot_buffers[overlap_id]; + if (overlap.IsPicked()) { + continue; + } + overlap.Pick(); + overlap_ids.push_back(overlap_id); + const VAddr overlap_cpu_addr = overlap.CpuAddr(); + if (overlap_cpu_addr < cpu_addr_begin) { + cpu_addr = cpu_addr_begin = overlap_cpu_addr; } + cpu_addr_end = std::max(cpu_addr_end, overlap_cpu_addr + overlap.SizeBytes()); } - - void UnmarkRegionAsWritten(VAddr start, VAddr end) { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - auto it = written_pages.find(page_start); - if (it != written_pages.end()) { - if (it->second > 1) { - --it->second; - } else { - written_pages.erase(it); - } - } + const u32 size = static_cast(cpu_addr_end - cpu_addr_begin); + const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, cpu_addr_begin, size); + Buffer& new_buffer = slot_buffers[new_buffer_id]; + + for (const BufferId overlap_id : overlap_ids) { + Buffer& overlap = slot_buffers[overlap_id]; + overlap.Unpick(); + + std::vector copies; + const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); + overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = begin, + .dst_offset = dst_base_offset + begin, + .size = range_size, + }); + new_buffer.UnmarkRegionAsCpuModified(begin, range_size); + new_buffer.MarkRegionAsGpuModified(begin, range_size); + }); + if (!copies.empty()) { + runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); + } + ReplaceBufferDownloads(overlap_id, new_buffer_id); + DeleteBuffer(overlap_id); + } + Register(new_buffer_id); + return new_buffer_id; +} + +template +void BufferCache

::Register(BufferId buffer_id) { + ChangeRegister(buffer_id); +} + +template +void BufferCache

::Unregister(BufferId buffer_id) { + ChangeRegister(buffer_id); +} + +template +template +void BufferCache

::ChangeRegister(BufferId buffer_id) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr_begin = buffer.CpuAddr(); + const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); + const u64 page_begin = cpu_addr_begin / PAGE_SIZE; + const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); + for (u64 page = page_begin; page != page_end; ++page) { + if constexpr (insert) { + page_table[page] = buffer_id; + } else { + page_table[page] = BufferId{}; } } +} - bool IsRegionWritten(VAddr start, VAddr end) const { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - if (written_pages.contains(page_start)) { - return true; +template +void BufferCache

::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { + if (buffer.CpuAddr() == 0) { + return; + } + SynchronizeBufferImpl(buffer, cpu_addr, size); +} + +template +void BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = range_offset, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + return; + } + const std::span copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); +} + +template +void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span copies) { + if constexpr (USE_MEMORY_MAPS) { + MappedUploadMemory(buffer, total_size_bytes, copies); + } else { + ImmediateUploadMemory(buffer, largest_copy, copies); + } +} + +template +void BufferCache

::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span copies) { + std::span immediate_buffer; + for (const BufferCopy& copy : copies) { + std::span upload_span; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + if (IsRangeGranular(cpu_addr, copy.size)) { + upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); + } else { + if (immediate_buffer.empty()) { + immediate_buffer = ImmediateBuffer(largest_copy); } + cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + upload_span = immediate_buffer.subspan(0, copy.size); } - return false; + buffer.ImmediateUpload(copy.dst_offset, upload_span); } - - void QueueDestruction(std::shared_ptr buffer) { - buffer->SetEpoch(epoch); - pending_destruction.push(std::move(buffer)); +} + +template +void BufferCache

::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, + std::span copies) { + auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); + const std::span staging_pointer = upload_staging.mapped_span; + for (const BufferCopy& copy : copies) { + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + u8* const src_pointer = staging_pointer.data() + copy.src_offset; + cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); } - - void MarkForAsyncFlush(MapInterval* map) { - if (!uncommitted_flushes) { - uncommitted_flushes = std::make_shared>(); + runtime.CopyBuffer(buffer, upload_staging.buffer, copies); +} + +template +void BufferCache

::DeleteBuffer(BufferId buffer_id) { + const auto scalar_replace = [buffer_id](Binding& binding) { + if (binding.buffer_id == buffer_id) { + binding.buffer_id = BufferId{}; + } + }; + const auto replace = [scalar_replace](std::span bindings) { + std::ranges::for_each(bindings, scalar_replace); + }; + scalar_replace(index_buffer); + replace(vertex_buffers); + std::ranges::for_each(uniform_buffers, replace); + std::ranges::for_each(storage_buffers, replace); + replace(transform_feedback_buffers); + replace(compute_uniform_buffers); + replace(compute_storage_buffers); + std::erase(cached_write_buffer_ids, buffer_id); + + // Mark the whole buffer as CPU written to stop tracking CPU writes + Buffer& buffer = slot_buffers[buffer_id]; + buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + + Unregister(buffer_id); + delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); + + NotifyBufferDeletion(); +} + +template +void BufferCache

::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) { + const auto replace = [old_buffer_id, new_buffer_id](std::vector& buffers) { + std::ranges::replace(buffers, old_buffer_id, new_buffer_id); + if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) { + buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end()); } - uncommitted_flushes->insert(map); + }; + replace(uncommitted_downloads); + std::ranges::for_each(committed_downloads, replace); +} + +template +void BufferCache

::NotifyBufferDeletion() { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty_uniform_buffers.fill(~u32{0}); } + auto& flags = maxwell3d.dirty.flags; + flags[Dirty::IndexBuffer] = true; + flags[Dirty::VertexBuffers] = true; + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + flags[Dirty::VertexBuffer0 + index] = true; + } + has_deleted_buffers = true; +} + +template +typename BufferCache

::Binding BufferCache

::StorageBufferBinding(GPUVAddr ssbo_addr) const { + const GPUVAddr gpu_addr = gpu_memory.Read(ssbo_addr); + const u32 size = gpu_memory.Read(ssbo_addr + 8); + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr || size == 0) { + return NULL_BINDING; + } + const Binding binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = BufferId{}, + }; + return binding; +} + +template +std::span BufferCache

::ImmediateBufferWithData(VAddr cpu_addr, size_t size) { + u8* const base_pointer = cpu_memory.GetPointer(cpu_addr); + if (IsRangeGranular(cpu_addr, size) || + base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) { + return std::span(base_pointer, size); + } else { + const std::span span = ImmediateBuffer(size); + cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); + return span; + } +} - VideoCore::RasterizerInterface& rasterizer; - Tegra::MemoryManager& gpu_memory; - Core::Memory::Memory& cpu_memory; - StreamBuffer& stream_buffer; - - u8* buffer_ptr = nullptr; - u64 buffer_offset = 0; - u64 buffer_offset_base = 0; - - MapIntervalAllocator mapped_addresses_allocator; - boost::intrusive::set> - mapped_addresses; - - std::unordered_map written_pages; - std::unordered_map> blocks; - - std::queue> pending_destruction; - u64 epoch = 0; - u64 modified_ticks = 0; - - std::vector staging_buffer; - - std::list marked_for_unregister; - - std::shared_ptr> uncommitted_flushes; - std::list>> committed_flushes; - - std::recursive_mutex mutex; -}; +template +std::span BufferCache

::ImmediateBuffer(size_t wanted_capacity) { + if (wanted_capacity > immediate_buffer_capacity) { + immediate_buffer_capacity = wanted_capacity; + immediate_buffer_alloc = std::make_unique(wanted_capacity); + } + return std::span(immediate_buffer_alloc.get(), wanted_capacity); +} + +template +bool BufferCache

::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept { + if constexpr (IS_OPENGL) { + return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0; + } else { + // Only OpenGL has fast uniform buffers + return false; + } +} } // namespace VideoCommon diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp deleted file mode 100644 index 62587e18a..000000000 --- a/src/video_core/buffer_cache/map_interval.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2020 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include -#include -#include -#include - -#include "video_core/buffer_cache/map_interval.h" - -namespace VideoCommon { - -MapIntervalAllocator::MapIntervalAllocator() { - FillFreeList(first_chunk); -} - -MapIntervalAllocator::~MapIntervalAllocator() = default; - -void MapIntervalAllocator::AllocateNewChunk() { - *new_chunk = std::make_unique(); - FillFreeList(**new_chunk); - new_chunk = &(*new_chunk)->next; -} - -void MapIntervalAllocator::FillFreeList(Chunk& chunk) { - const std::size_t old_size = free_list.size(); - free_list.resize(old_size + chunk.data.size()); - std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size, - [](MapInterval& interval) { return &interval; }); -} - -} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h deleted file mode 100644 index ef974b08a..000000000 --- a/src/video_core/buffer_cache/map_interval.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include -#include -#include -#include - -#include - -#include "common/common_types.h" -#include "video_core/gpu.h" - -namespace VideoCommon { - -struct MapInterval : public boost::intrusive::set_base_hook> { - MapInterval() = default; - - /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {} - - explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept - : start{start_}, end{end_}, gpu_addr{gpu_addr_} {} - - bool IsInside(VAddr other_start, VAddr other_end) const noexcept { - return start <= other_start && other_end <= end; - } - - bool Overlaps(VAddr other_start, VAddr other_end) const noexcept { - return start < other_end && other_start < end; - } - - void MarkAsModified(bool is_modified_, u64 ticks_) noexcept { - is_modified = is_modified_; - ticks = ticks_; - } - - boost::intrusive::set_member_hook<> member_hook_; - VAddr start = 0; - VAddr end = 0; - GPUVAddr gpu_addr = 0; - u64 ticks = 0; - bool is_written = false; - bool is_modified = false; - bool is_registered = false; - bool is_memory_marked = false; - bool is_sync_pending = false; -}; - -struct MapIntervalCompare { - constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept { - return lhs.start < rhs.start; - } -}; - -class MapIntervalAllocator { -public: - MapIntervalAllocator(); - ~MapIntervalAllocator(); - - MapInterval* Allocate() { - if (free_list.empty()) { - AllocateNewChunk(); - } - MapInterval* const interval = free_list.back(); - free_list.pop_back(); - return interval; - } - - void Release(MapInterval* interval) { - free_list.push_back(interval); - } - -private: - struct Chunk { - std::unique_ptr next; - std::array data; - }; - - void AllocateNewChunk(); - - void FillFreeList(Chunk& chunk); - - std::vector free_list; - - Chunk first_chunk; - - std::unique_ptr* new_chunk = &first_chunk.next; -}; - -} // namespace VideoCommon -- cgit v1.2.3