From 0f89828073a541eaa2cfd985483f839bd2f97b74 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 9 Feb 2022 15:39:40 +0100 Subject: MacroHLE: Implement DrawIndexedIndirect & DrawArraysIndirect. --- src/video_core/buffer_cache/buffer_cache.h | 160 +++++++++++++++++++++++++---- 1 file changed, 140 insertions(+), 20 deletions(-) (limited to 'src/video_core/buffer_cache/buffer_cache.h') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 99abe0edf..557227b37 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -171,7 +171,9 @@ public: bool is_written, bool is_image); [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, - bool synchronize, bool mark_as_written); + bool synchronize = true, + bool mark_as_written = false, + bool discard_downloads = false); void FlushCachedWrites(); @@ -203,6 +205,14 @@ public: /// Return true when a CPU region is modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + void SetDrawIndirect(const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { + current_draw_indirect = current_draw_indirect_; + } + + [[nodiscard]] std::pair GetDrawIndirectCount(); + + [[nodiscard]] std::pair GetDrawIndirectBuffer(); + std::mutex mutex; Runtime& runtime; @@ -275,6 +285,8 @@ private: void BindHostVertexBuffers(); + void BindHostDrawIndirectBuffers(); + void BindHostGraphicsUniformBuffers(size_t stage); void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); @@ -301,6 +313,8 @@ private: void UpdateVertexBuffer(u32 index); + void UpdateDrawIndirect(); + void UpdateUniformBuffers(size_t stage); void UpdateStorageBuffers(size_t stage); @@ -340,6 +354,8 @@ private: bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size); + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span copies); @@ -375,6 +391,8 @@ private: SlotVector slot_buffers; DelayedDestructionRing delayed_destruction_ring; + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; + u32 last_index_count = 0; Binding index_buffer; @@ -383,6 +401,8 @@ private: std::array, NUM_STAGES> storage_buffers; std::array, NUM_STAGES> texture_buffers; std::array transform_feedback_buffers; + Binding count_buffer_binding; + Binding indirect_buffer_binding; std::array compute_uniform_buffers; std::array compute_storage_buffers; @@ -422,6 +442,7 @@ private: std::vector cached_write_buffer_ids; + IntervalSet discarded_ranges; IntervalSet uncommitted_ranges; IntervalSet common_ranges; std::deque committed_ranges; @@ -579,13 +600,17 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am }}; boost::container::small_vector tmp_intervals; + const bool is_high_accuracy = + Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; auto mirror = [&](VAddr base_address, VAddr base_address_end) { const u64 size = base_address_end - base_address; const VAddr diff = base_address - *cpu_src_address; const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; - uncommitted_ranges.add(add_interval); tmp_intervals.push_back(add_interval); + if (is_high_accuracy) { + uncommitted_ranges.add(add_interval); + } }; ForEachWrittenRange(*cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. @@ -677,6 +702,9 @@ void BufferCache

::BindHostGeometryBuffers(bool is_indexed) { } BindHostVertexBuffers(); BindHostTransformFeedbackBuffers(); + if (current_draw_indirect) { + BindHostDrawIndirectBuffers(); + } } template @@ -796,7 +824,8 @@ void BufferCache

::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add template std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_addr, u32 size, bool synchronize, - bool mark_as_written) { + bool mark_as_written, + bool discard_downloads) { const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); if (!cpu_addr) { return {&slot_buffers[NULL_BUFFER_ID], 0}; @@ -804,11 +833,17 @@ std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_ad const BufferId buffer_id = FindBuffer(*cpu_addr, size); Buffer& buffer = slot_buffers[buffer_id]; if (synchronize) { - SynchronizeBuffer(buffer, *cpu_addr, size); + // SynchronizeBuffer(buffer, *cpu_addr, size); + SynchronizeBufferNoModified(buffer, *cpu_addr, size); } if (mark_as_written) { MarkWrittenBuffer(buffer_id, *cpu_addr, size); } + if (discard_downloads) { + IntervalType interval{*cpu_addr, size}; + ClearDownload(interval); + discarded_ranges.subtract(interval); + } return {&buffer, buffer.Offset(*cpu_addr)}; } @@ -827,10 +862,6 @@ bool BufferCache

::HasUncommittedFlushes() const noexcept { template void BufferCache

::AccumulateFlushes() { - if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) { - uncommitted_ranges.clear(); - return; - } if (uncommitted_ranges.empty()) { return; } @@ -845,12 +876,15 @@ bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { template void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); + + for (const auto& interval : discarded_ranges) { + common_ranges.subtract(interval); + } + if (committed_ranges.empty()) { return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); - const bool is_accuracy_normal = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal; auto it = committed_ranges.begin(); while (it != committed_ranges.end()) { @@ -875,9 +909,6 @@ void BufferCache

::CommitAsyncFlushesHigh() { ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { buffer.ForEachDownloadRangeAndClear( cpu_addr, size, [&](u64 range_offset, u64 range_size) { - if (is_accuracy_normal) { - return; - } const VAddr buffer_addr = buffer.CpuAddr(); const auto add_download = [&](VAddr start, VAddr end) { const u64 new_offset = start - buffer_addr; @@ -891,7 +922,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { buffer_id, }); // Align up to avoid cache conflicts - constexpr u64 align = 256ULL; + constexpr u64 align = 8ULL; constexpr u64 mask = ~(align - 1ULL); total_size_bytes += (new_size + align - 1) & mask; largest_copy = std::max(largest_copy, new_size); @@ -942,12 +973,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { template void BufferCache

::CommitAsyncFlushes() { - if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { - CommitAsyncFlushesHigh(); - } else { - uncommitted_ranges.clear(); - committed_ranges.clear(); - } + CommitAsyncFlushesHigh(); } template @@ -1063,6 +1089,19 @@ void BufferCache

::BindHostVertexBuffers() { } } +template +void BufferCache

::BindHostDrawIndirectBuffers() { + const auto bind_buffer = [this](const Binding& binding) { + Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer, binding.buffer_id); + SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); + }; + if (current_draw_indirect->include_count) { + bind_buffer(count_buffer_binding); + } + bind_buffer(indirect_buffer_binding); +} + template void BufferCache

::BindHostGraphicsUniformBuffers(size_t stage) { u32 dirty = ~0U; @@ -1294,6 +1333,9 @@ void BufferCache

::DoUpdateGraphicsBuffers(bool is_indexed) { UpdateStorageBuffers(stage); UpdateTextureBuffers(stage); } + if (current_draw_indirect) { + UpdateDrawIndirect(); + } } while (has_deleted_buffers); } @@ -1383,6 +1425,27 @@ void BufferCache

::UpdateVertexBuffer(u32 index) { }; } +template +void BufferCache

::UpdateDrawIndirect() { + const auto update = [this](GPUVAddr gpu_addr, size_t size, Binding& binding) { + const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + binding = NULL_BINDING; + return; + } + binding = Binding{ + .cpu_addr = *cpu_addr, + .size = static_cast(size), + .buffer_id = FindBuffer(*cpu_addr, static_cast(size)), + }; + }; + if (current_draw_indirect->include_count) { + update(current_draw_indirect->count_start_address, sizeof(u32), count_buffer_binding); + } + update(current_draw_indirect->indirect_start_address, current_draw_indirect->buffer_size, + indirect_buffer_binding); +} + template void BufferCache

::UpdateUniformBuffers(size_t stage) { ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) { @@ -1704,6 +1767,51 @@ bool BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s return false; } +template +bool BufferCache

::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + IntervalSet found_sets{}; + auto make_copies = [&] { + for (auto& interval : found_sets) { + const std::size_t sub_size = interval.upper() - interval.lower(); + const VAddr cpu_addr = interval.lower(); + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = cpu_addr - buffer.CpuAddr(), + .size = sub_size, + }); + total_size_bytes += sub_size; + largest_copy = std::max(largest_copy, sub_size); + } + const std::span copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); + }; + buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + const VAddr base_adr = buffer.CpuAddr() + range_offset; + const VAddr end_adr = base_adr + range_size; + const IntervalType add_interval{base_adr, end_adr}; + found_sets.add(add_interval); + }); + if (found_sets.empty()) { + return true; + } + const IntervalType search_interval{cpu_addr, cpu_addr + size}; + auto it = common_ranges.lower_bound(search_interval); + auto it_end = common_ranges.upper_bound(search_interval); + if (it == common_ranges.end()) { + make_copies(); + return false; + } + while (it != it_end) { + found_sets.subtract(*it); + it++; + } + make_copies(); + return false; +} + template void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span copies) { @@ -1963,4 +2071,16 @@ bool BufferCache

::HasFastUniformBufferBound(size_t stage, u32 binding_index) } } +template +std::pair::Buffer*, u32> BufferCache

::GetDrawIndirectCount() { + auto& buffer = slot_buffers[count_buffer_binding.buffer_id]; + return std::make_pair(&buffer, buffer.Offset(count_buffer_binding.cpu_addr)); +} + +template +std::pair::Buffer*, u32> BufferCache

::GetDrawIndirectBuffer() { + auto& buffer = slot_buffers[indirect_buffer_binding.buffer_id]; + return std::make_pair(&buffer, buffer.Offset(indirect_buffer_binding.cpu_addr)); +} + } // namespace VideoCommon -- cgit v1.2.3