diff options
Diffstat (limited to 'src/video_core/renderer_vulkan')
-rw-r--r-- | src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 3 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_compute_pass.cpp | 181 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_compute_pass.h | 31 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_fence_manager.h | 2 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_query_cache.cpp | 1593 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_query_cache.h | 106 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_rasterizer.cpp | 107 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_rasterizer.h | 14 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_scheduler.cpp | 9 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_scheduler.h | 12 |
10 files changed, 1816 insertions, 242 deletions
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index e15865d16..d8148e89a 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -61,6 +61,9 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo if (device.IsExtTransformFeedbackSupported()) { flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } + if (device.IsExtConditionalRendering()) { + flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT; + } const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 54ee030ce..289d5b25c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -12,6 +12,9 @@ #include "common/common_types.h" #include "common/div_ceil.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h" +#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" +#include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h" +#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" @@ -57,6 +60,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE }, }}; +constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, +}}; + constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .uniform_buffers = 0, .storage_buffers = 2, @@ -67,6 +94,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .score = 2, }; +constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ + .uniform_buffers = 0, + .storage_buffers = 3, + .texture_buffers = 0, + .image_buffers = 0, + .textures = 0, + .images = 0, + .score = 3, +}; + constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -103,6 +140,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT .stride = sizeof(DescriptorUpdateEntry), }; +constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 3, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), +}; + constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ { @@ -131,13 +177,21 @@ struct AstcPushConstants { u32 block_height; u32 block_height_mask; }; + +struct QueriesPrefixScanPushConstants { + u32 min_accumulation_base; + u32 max_accumulation_base; + u32 accumulation_limit; + u32 buffer_offset; +}; } // Anonymous namespace ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorUpdateTemplateEntry> templates, const DescriptorBankInfo& bank_info, - vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) + vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, + std::optional<u32> optional_subgroup_size) : device{device_} { descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, @@ -178,13 +232,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, .pCode = code.data(), }); device.SaveShader(code); + const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = nullptr, + .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U, + }; + bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size; pipeline = device.GetLogical().CreateComputePipeline({ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, .stage{ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, - .pNext = nullptr, + .pNext = use_setup_size ? &subgroup_size_ci : nullptr, .flags = 0, .stage = VK_SHADER_STAGE_COMPUTE_BIT, .module = *module, @@ -302,6 +362,123 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble( return {staging.buffer, staging.offset}; } +ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS, + INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr, + RESOLVE_CONDITIONAL_RENDER_COMP_SPV), + scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} + +void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, + u32 src_offset, bool compare_to_zero) { + const size_t compare_size = compare_to_zero ? 8 : 24; + + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.Dispatch(1, 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); + }); +} + +QueriesPrefixScanPass::QueriesPrefixScanPass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass( + device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, + QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, + COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && + device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) + ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) + : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)), + scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} + +void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, + VkBuffer src_buffer, size_t number_of_sums, + size_t min_accumulation_limit, size_t max_accumulation_limit) { + size_t current_runs = number_of_sums; + size_t offset = 0; + while (current_runs != 0) { + static constexpr size_t DISPATCH_SIZE = 2048U; + size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE); + current_runs -= runs_to_do; + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + size_t used_offset = offset; + offset += runs_to_do; + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit, + runs_to_do, used_offset](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | + VK_ACCESS_UNIFORM_READ_BIT | + VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const QueriesPrefixScanPushConstants uniforms{ + .min_accumulation_base = static_cast<u32>(min_accumulation_limit), + .max_accumulation_base = static_cast<u32>(max_accumulation_limit), + .accumulation_limit = static_cast<u32>(runs_to_do - 1), + .buffer_offset = static_cast<u32>(used_offset), + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(1, 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, + write_barrier); + }); + } +} + ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index dd3927376..3ff935639 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -3,6 +3,7 @@ #pragma once +#include <optional> #include <span> #include <utility> @@ -31,7 +32,8 @@ public: vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorUpdateTemplateEntry> templates, const DescriptorBankInfo& bank_info, - vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); + vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code, + std::optional<u32> optional_subgroup_size = std::nullopt); ~ComputePass(); protected: @@ -82,6 +84,33 @@ private: ComputePassDescriptorQueue& compute_pass_descriptor_queue; }; +class ConditionalRenderingResolvePass final : public ComputePass { +public: + explicit ConditionalRenderingResolvePass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + + void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero); + +private: + Scheduler& scheduler; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + +class QueriesPrefixScanPass final : public ComputePass { +public: + explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + + void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, + size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit); + +private: + Scheduler& scheduler; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + class ASTCDecoderPass final : public ComputePass { public: explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 145359d4e..336573574 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -7,6 +7,7 @@ #include "video_core/fence_manager.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" namespace Core { @@ -20,7 +21,6 @@ class RasterizerInterface; namespace Vulkan { class Device; -class QueryCache; class Scheduler; class InnerFence : public VideoCommon::FenceBase { diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 29e0b797b..a32da3ba3 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -1,139 +1,1552 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later -#include <algorithm> #include <cstddef> +#include <limits> +#include <map> +#include <memory> +#include <span> +#include <type_traits> +#include <unordered_map> #include <utility> #include <vector> +#include "common/bit_util.h" +#include "common/common_types.h" +#include "core/memory.h" +#include "video_core/engines/draw_manager.h" +#include "video_core/query_cache/query_cache.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -using VideoCore::QueryType; +using Tegra::Engines::Maxwell3D; +using VideoCommon::QueryType; namespace { +class SamplesQueryBank : public VideoCommon::BankBase { +public: + static constexpr size_t BANK_SIZE = 256; + static constexpr size_t QUERY_SIZE = 8; + explicit SamplesQueryBank(const Device& device_, size_t index_) + : BankBase(BANK_SIZE), device{device_}, index{index_} { + const auto& dev = device.GetLogical(); + query_pool = dev.CreateQueryPool({ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queryType = VK_QUERY_TYPE_OCCLUSION, + .queryCount = BANK_SIZE, + .pipelineStatistics = 0, + }); + Reset(); + } -constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; + ~SamplesQueryBank() = default; -constexpr VkQueryType GetTarget(QueryType type) { - return QUERY_TARGETS[static_cast<std::size_t>(type)]; -} + void Reset() override { + ASSERT(references == 0); + VideoCommon::BankBase::Reset(); + const auto& dev = device.GetLogical(); + dev.ResetQueryPool(*query_pool, 0, BANK_SIZE); + host_results.fill(0ULL); + next_bank = 0; + } + + void Sync(size_t start, size_t size) { + const auto& dev = device.GetLogical(); + const VkResult query_result = dev.GetQueryResults( + *query_pool, static_cast<u32>(start), static_cast<u32>(size), sizeof(u64) * size, + &host_results[start], sizeof(u64), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + switch (query_result) { + case VK_SUCCESS: + return; + case VK_ERROR_DEVICE_LOST: + device.ReportLoss(); + [[fallthrough]]; + default: + throw vk::Exception(query_result); + } + } + + VkQueryPool GetInnerPool() { + return *query_pool; + } + + size_t GetIndex() const { + return index; + } + + const std::array<u64, BANK_SIZE>& GetResults() const { + return host_results; + } + + size_t next_bank; + +private: + const Device& device; + const size_t index; + vk::QueryPool query_pool; + std::array<u64, BANK_SIZE> host_results; +}; + +using BaseStreamer = VideoCommon::SimpleStreamer<VideoCommon::HostQueryBase>; + +struct HostSyncValues { + VAddr address; + size_t size; + size_t offset; + + static constexpr bool GeneratesBaseBuffer = false; +}; + +class SamplesStreamer : public BaseStreamer { +public: + explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, + VideoCore::RasterizerInterface* rasterizer_, const Device& device_, + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) + : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, + scheduler{scheduler_}, memory_allocator{memory_allocator_} { + current_bank = nullptr; + current_query = nullptr; + ammend_value = 0; + acumulation_value = 0; + queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>( + device, scheduler, descriptor_pool, compute_pass_descriptor_queue); + + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = 8, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); + }); + } + + ~SamplesStreamer() = default; + + void StartCounter() override { + if (has_started) { + return; + } + ReserveHostQuery(); + scheduler.Record([query_pool = current_query_pool, + query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { + const bool use_precise = Settings::IsGPULevelHigh(); + cmdbuf.BeginQuery(query_pool, static_cast<u32>(query_index), + use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); + }); + has_started = true; + } + + void PauseCounter() override { + if (!has_started) { + return; + } + scheduler.Record([query_pool = current_query_pool, + query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { + cmdbuf.EndQuery(query_pool, static_cast<u32>(query_index)); + }); + has_started = false; + } + + void ResetCounter() override { + if (has_started) { + PauseCounter(); + } + AbandonCurrentQuery(); + std::function<void()> func([this, counts = pending_flush_queries.size()] { + ammend_value = 0; + acumulation_value = 0; + }); + rasterizer->SyncOperation(std::move(func)); + accumulation_since_last_sync = false; + first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used); + last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used); + } + + void CloseCounter() override { + PauseCounter(); + } + + bool HasPendingSync() const override { + return !pending_sync.empty(); + } + + void SyncWrites() override { + if (sync_values_stash.empty()) { + return; + } + + for (size_t i = 0; i < sync_values_stash.size(); i++) { + runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], + *buffers[resolve_buffers[i]]); + } + + sync_values_stash.clear(); + } + + void PresyncWrites() override { + if (pending_sync.empty()) { + return; + } + PauseCounter(); + sync_values_stash.clear(); + sync_values_stash.emplace_back(); + std::vector<HostSyncValues>* sync_values = &sync_values_stash.back(); + sync_values->reserve(num_slots_used); + std::unordered_map<size_t, std::pair<size_t, size_t>> offsets; + resolve_buffers.clear(); + size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used); + resolve_buffers.push_back(resolve_buffer_index); + size_t base_offset = 0; + + ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start, + size_t amount) { + size_t bank_id = bank->GetIndex(); + auto& resolve_buffer = buffers[resolve_buffer_index]; + VkQueryPool query_pool = bank->GetInnerPool(); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([start, amount, base_offset, query_pool, + buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { + const VkBufferMemoryBarrier copy_query_pool_barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buffer, + .offset = base_offset, + .size = amount * SamplesQueryBank::QUERY_SIZE, + }; + + cmdbuf.CopyQueryPoolResults( + query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer, + static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE, + VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); + }); + offsets[bank_id] = {start, base_offset}; + base_offset += amount * SamplesQueryBank::QUERY_SIZE; + }); + + // Convert queries + bool has_multi_queries = false; + for (auto q : pending_sync) { + auto* query = GetQuery(q); + size_t sync_value_slot = 0; + if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { + continue; + } + if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { + continue; + } + if (accumulation_since_last_sync || query->size_slots > 1) { + if (!has_multi_queries) { + has_multi_queries = true; + sync_values_stash.emplace_back(); + } + sync_value_slot = 1; + } + query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; + auto loc_data = offsets[query->start_bank_id]; + sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{ + .address = query->guest_address, + .size = SamplesQueryBank::QUERY_SIZE, + .offset = + loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) * + SamplesQueryBank::QUERY_SIZE, + }); + } + + if (has_multi_queries) { + size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used); + resolve_buffers.push_back(intermediary_buffer_index); + queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], + *buffers[resolve_buffer_index], num_slots_used, + std::min(first_accumulation_checkpoint, num_slots_used), + last_accumulation_checkpoint); + + } else { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); + }); + } + + ReplicateCurrentQueryIfNeeded(); + std::function<void()> func([this] { ammend_value = acumulation_value; }); + rasterizer->SyncOperation(std::move(func)); + AbandonCurrentQuery(); + num_slots_used = 0; + first_accumulation_checkpoint = std::numeric_limits<size_t>::max(); + last_accumulation_checkpoint = 0; + accumulation_since_last_sync = has_multi_queries; + pending_sync.clear(); + } + + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + [[maybe_unused]] std::optional<u32> subreport) override { + PauseCounter(); + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 0; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!current_query) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + new_query->start_bank_id = current_query->start_bank_id; + new_query->size_banks = current_query->size_banks; + new_query->start_slot = current_query->start_slot; + new_query->size_slots = current_query->size_slots; + ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->AddReference(amount); + }); + pending_sync.push_back(index); + pending_flush_queries.push_back(index); + return index; + } + + bool HasUnsyncedQueries() const override { + return !pending_flush_queries.empty(); + } + + void PushUnsyncedQueries() override { + PauseCounter(); + current_bank->Close(); + { + std::scoped_lock lk(flush_guard); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + } + } + + void PopUnsyncedQueries() override { + std::vector<size_t> current_flush_queries; + { + std::scoped_lock lk(flush_guard); + current_flush_queries = std::move(pending_flush_sets.front()); + pending_flush_sets.pop_front(); + } + ApplyBanksWideOp<false>( + current_flush_queries, + [](SamplesQueryBank* bank, size_t start, size_t amount) { bank->Sync(start, amount); }); + for (auto q : current_flush_queries) { + auto* query = GetQuery(q); + u64 total = 0; + ApplyBankOp(query, [&total](SamplesQueryBank* bank, size_t start, size_t amount) { + const auto& results = bank->GetResults(); + for (size_t i = 0; i < amount; i++) { + total += results[start + i]; + } + }); + query->value = total; + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + } + } + +private: + template <typename Func> + void ApplyBankOp(VideoCommon::HostQueryBase* query, Func&& func) { + size_t size_slots = query->size_slots; + if (size_slots == 0) { + return; + } + size_t bank_id = query->start_bank_id; + size_t banks_set = query->size_banks; + size_t start_slot = query->start_slot; + for (size_t i = 0; i < banks_set; i++) { + auto& the_bank = bank_pool.GetBank(bank_id); + size_t amount = std::min(the_bank.Size() - start_slot, size_slots); + func(&the_bank, start_slot, amount); + bank_id = the_bank.next_bank - 1; + start_slot = 0; + size_slots -= amount; + } + } + + template <bool is_ordered, typename Func> + void ApplyBanksWideOp(std::vector<size_t>& queries, Func&& func) { + std::conditional_t<is_ordered, std::map<size_t, std::pair<size_t, size_t>>, + std::unordered_map<size_t, std::pair<size_t, size_t>>> + indexer; + for (auto q : queries) { + auto* query = GetQuery(q); + ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) { + auto id_ = bank->GetIndex(); + auto pair = indexer.try_emplace(id_, std::numeric_limits<size_t>::max(), + std::numeric_limits<size_t>::min()); + auto& current_pair = pair.first->second; + current_pair.first = std::min(current_pair.first, start); + current_pair.second = std::max(current_pair.second, amount + start); + }); + } + for (auto& cont : indexer) { + func(&bank_pool.GetBank(cont.first), cont.second.first, + cont.second.second - cont.second.first); + } + } + + void ReserveBank() { + current_bank_id = + bank_pool.ReserveBank([this](std::deque<SamplesQueryBank>& queue, size_t index) { + queue.emplace_back(device, index); + }); + if (current_bank) { + current_bank->next_bank = current_bank_id + 1; + } + current_bank = &bank_pool.GetBank(current_bank_id); + current_query_pool = current_bank->GetInnerPool(); + } + + size_t ReserveBankSlot() { + if (!current_bank || current_bank->IsClosed()) { + ReserveBank(); + } + auto [built, index] = current_bank->Reserve(); + current_bank_slot = index; + return index; + } + + void ReserveHostQuery() { + size_t new_slot = ReserveBankSlot(); + current_bank->AddReference(1); + num_slots_used++; + if (current_query) { + size_t bank_id = current_query->start_bank_id; + size_t banks_set = current_query->size_banks - 1; + bool found = bank_id == current_bank_id; + while (!found && banks_set > 0) { + SamplesQueryBank& some_bank = bank_pool.GetBank(bank_id); + bank_id = some_bank.next_bank - 1; + found = bank_id == current_bank_id; + banks_set--; + } + if (!found) { + current_query->size_banks++; + } + current_query->size_slots++; + } else { + current_query_id = BuildQuery(); + current_query = GetQuery(current_query_id); + current_query->start_bank_id = static_cast<u32>(current_bank_id); + current_query->size_banks = 1; + current_query->start_slot = new_slot; + current_query->size_slots = 1; + } + } + + void Free(size_t query_id) override { + std::scoped_lock lk(guard); + auto* query = GetQuery(query_id); + ApplyBankOp(query, [](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->CloseReference(amount); + }); + ReleaseQuery(query_id); + } + + void AbandonCurrentQuery() { + if (!current_query) { + return; + } + Free(current_query_id); + current_query = nullptr; + current_query_id = 0; + } + + void ReplicateCurrentQueryIfNeeded() { + if (pending_sync.empty()) { + return; + } + if (!current_query) { + return; + } + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = 0; + new_query->value = 0; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + new_query->start_bank_id = current_query->start_bank_id; + new_query->size_banks = current_query->size_banks; + new_query->start_slot = current_query->start_slot; + new_query->size_slots = current_query->size_slots; + ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { + bank->AddReference(amount); + }); + pending_flush_queries.push_back(index); + std::function<void()> func([this, index] { + auto* query = GetQuery(index); + query->value += GetAmmendValue(); + SetAccumulationValue(query->value); + Free(index); + }); + } + + template <bool is_resolve> + size_t ObtainBuffer(size_t num_needed) { + const size_t log_2 = std::max<size_t>(11U, Common::Log2Ceil64(num_needed)); + if constexpr (is_resolve) { + if (resolve_table[log_2] != 0) { + return resolve_table[log_2] - 1; + } + } else { + if (intermediary_table[log_2] != 0) { + return intermediary_table[log_2] - 1; + } + } + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); + if constexpr (is_resolve) { + resolve_table[log_2] = buffers.size(); + } else { + intermediary_table[log_2] = buffers.size(); + } + return buffers.size() - 1; + } + + QueryCacheRuntime& runtime; + VideoCore::RasterizerInterface* rasterizer; + const Device& device; + Scheduler& scheduler; + const MemoryAllocator& memory_allocator; + VideoCommon::BankPool<SamplesQueryBank> bank_pool; + std::deque<vk::Buffer> buffers; + std::array<size_t, 32> resolve_table{}; + std::array<size_t, 32> intermediary_table{}; + vk::Buffer accumulation_buffer; + std::deque<std::vector<HostSyncValues>> sync_values_stash; + std::vector<size_t> resolve_buffers; + + // syncing queue + std::vector<size_t> pending_sync; + + // flush levels + std::vector<size_t> pending_flush_queries; + std::deque<std::vector<size_t>> pending_flush_sets; + + // State Machine + size_t current_bank_slot; + size_t current_bank_id; + SamplesQueryBank* current_bank; + VkQueryPool current_query_pool; + size_t current_query_id; + size_t num_slots_used{}; + size_t first_accumulation_checkpoint{}; + size_t last_accumulation_checkpoint{}; + bool accumulation_since_last_sync{}; + VideoCommon::HostQueryBase* current_query; + bool has_started{}; + std::mutex flush_guard; + + std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass; +}; + +// Transform feedback queries +class TFBQueryBank : public VideoCommon::BankBase { +public: + static constexpr size_t BANK_SIZE = 1024; + static constexpr size_t QUERY_SIZE = 4; + explicit TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator, + size_t index_) + : BankBase(BANK_SIZE), scheduler{scheduler_}, index{index_} { + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = QUERY_SIZE * BANK_SIZE, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + } + + ~TFBQueryBank() = default; + + void Reset() override { + ASSERT(references == 0); + VideoCommon::BankBase::Reset(); + } + + void Sync(StagingBufferRef& stagging_buffer, size_t extra_offset, size_t start, size_t size) { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, dst_buffer = stagging_buffer.buffer, extra_offset, start, + size](vk::CommandBuffer cmdbuf) { + std::array<VkBufferCopy, 1> copy{VkBufferCopy{ + .srcOffset = start * QUERY_SIZE, + .dstOffset = extra_offset, + .size = size * QUERY_SIZE, + }}; + cmdbuf.CopyBuffer(*buffer, dst_buffer, copy); + }); + } + + size_t GetIndex() const { + return index; + } + + VkBuffer GetBuffer() const { + return *buffer; + } + +private: + Scheduler& scheduler; + const size_t index; + vk::Buffer buffer; +}; + +class PrimitivesSucceededStreamer; + +class TFBCounterStreamer : public BaseStreamer { +public: + explicit TFBCounterStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_, + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + StagingBufferPool& staging_pool_) + : BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_}, + memory_allocator{memory_allocator_}, staging_pool{staging_pool_} { + buffers_count = 0; + current_bank = nullptr; + counter_buffers.fill(VK_NULL_HANDLE); + offsets.fill(0); + last_queries.fill(0); + last_queries_stride.fill(1); + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = TFBQueryBank::QUERY_SIZE * NUM_STREAMS, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + + counters_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + for (auto& c : counter_buffers) { + c = *counters_buffer; + } + size_t base_offset = 0; + for (auto& o : offsets) { + o = base_offset; + base_offset += TFBQueryBank::QUERY_SIZE; + } + } + + ~TFBCounterStreamer() = default; + + void StartCounter() override { + FlushBeginTFB(); + has_started = true; + } + + void PauseCounter() override { + CloseCounter(); + } + + void ResetCounter() override { + CloseCounter(); + } + + void CloseCounter() override { + if (has_flushed_end_pending) { + FlushEndTFB(); + } + runtime.View3DRegs([this](Maxwell3D& maxwell3d) { + if (maxwell3d.regs.transform_feedback_enabled == 0) { + streams_mask = 0; + has_started = false; + } + }); + } + + bool HasPendingSync() const override { + return !pending_sync.empty(); + } + + void SyncWrites() override { + CloseCounter(); + std::unordered_map<size_t, std::vector<HostSyncValues>> sync_values_stash; + for (auto q : pending_sync) { + auto* query = GetQuery(q); + if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { + continue; + } + if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { + continue; + } + query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; + sync_values_stash.try_emplace(query->start_bank_id); + sync_values_stash[query->start_bank_id].emplace_back(HostSyncValues{ + .address = query->guest_address, + .size = TFBQueryBank::QUERY_SIZE, + .offset = query->start_slot * TFBQueryBank::QUERY_SIZE, + }); + } + for (auto& p : sync_values_stash) { + auto& bank = bank_pool.GetBank(p.first); + runtime.template SyncValues<HostSyncValues>(p.second, bank.GetBuffer()); + } + pending_sync.clear(); + } -} // Anonymous namespace + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + std::optional<u32> subreport_) override { + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 0; + new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!subreport_) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + const size_t subreport = static_cast<size_t>(*subreport_); + last_queries[subreport] = address; + if ((streams_mask & (1ULL << subreport)) == 0) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + CloseCounter(); + auto [bank_slot, data_slot] = ProduceCounterBuffer(subreport); + new_query->start_bank_id = static_cast<u32>(bank_slot); + new_query->size_banks = 1; + new_query->start_slot = static_cast<u32>(data_slot); + new_query->size_slots = 1; + pending_sync.push_back(index); + pending_flush_queries.push_back(index); + return index; + } + + std::optional<std::pair<VAddr, size_t>> GetLastQueryStream(size_t stream) { + if (last_queries[stream] != 0) { + std::pair<VAddr, size_t> result(last_queries[stream], last_queries_stride[stream]); + return result; + } + return std::nullopt; + } + + Maxwell3D::Regs::PrimitiveTopology GetOutputTopology() const { + return out_topology; + } + + bool HasUnsyncedQueries() const override { + return !pending_flush_queries.empty(); + } + + void PushUnsyncedQueries() override { + CloseCounter(); + auto staging_ref = staging_pool.Request( + pending_flush_queries.size() * TFBQueryBank::QUERY_SIZE, MemoryUsage::Download, true); + size_t offset_base = staging_ref.offset; + for (auto q : pending_flush_queries) { + auto* query = GetQuery(q); + auto& bank = bank_pool.GetBank(query->start_bank_id); + bank.Sync(staging_ref, offset_base, query->start_slot, 1); + offset_base += TFBQueryBank::QUERY_SIZE; + bank.CloseReference(); + } + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); + }); + + std::scoped_lock lk(flush_guard); + for (auto& str : free_queue) { + staging_pool.FreeDeferred(str); + } + free_queue.clear(); + download_buffers.emplace_back(staging_ref); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + } + + void PopUnsyncedQueries() override { + StagingBufferRef staging_ref; + std::vector<size_t> flushed_queries; + { + std::scoped_lock lk(flush_guard); + staging_ref = download_buffers.front(); + flushed_queries = std::move(pending_flush_sets.front()); + download_buffers.pop_front(); + pending_flush_sets.pop_front(); + } + + size_t offset_base = staging_ref.offset; + for (auto q : flushed_queries) { + auto* query = GetQuery(q); + u32 result = 0; + std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32)); + query->value = static_cast<u64>(result); + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + offset_base += TFBQueryBank::QUERY_SIZE; + } + + { + std::scoped_lock lk(flush_guard); + free_queue.emplace_back(staging_ref); + } + } + +private: + void FlushBeginTFB() { + if (has_flushed_end_pending) [[unlikely]] { + return; + } + has_flushed_end_pending = true; + if (!has_started || buffers_count == 0) { + scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); + }); + UpdateBuffers(); + return; + } + scheduler.Record([this, total = static_cast<u32>(buffers_count)](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); + }); + UpdateBuffers(); + } + + void FlushEndTFB() { + if (!has_flushed_end_pending) [[unlikely]] { + UNREACHABLE(); + return; + } + has_flushed_end_pending = false; + + if (buffers_count == 0) { + scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); + }); + } else { + scheduler.Record([this, + total = static_cast<u32>(buffers_count)](vk::CommandBuffer cmdbuf) { + cmdbuf.EndTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); + }); + } + } + + void UpdateBuffers() { + last_queries.fill(0); + last_queries_stride.fill(1); + runtime.View3DRegs([this](Maxwell3D& maxwell3d) { + buffers_count = 0; + out_topology = maxwell3d.draw_manager->GetDrawState().topology; + for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { + const auto& tf = maxwell3d.regs.transform_feedback; + if (tf.buffers[i].enable == 0) { + continue; + } + const size_t stream = tf.controls[i].stream; + last_queries_stride[stream] = tf.controls[i].stride; + streams_mask |= 1ULL << stream; + buffers_count = std::max<size_t>(buffers_count, stream + 1); + } + }); + } + + std::pair<size_t, size_t> ProduceCounterBuffer(size_t stream) { + if (current_bank == nullptr || current_bank->IsClosed()) { + current_bank_id = + bank_pool.ReserveBank([this](std::deque<TFBQueryBank>& queue, size_t index) { + queue.emplace_back(scheduler, memory_allocator, index); + }); + current_bank = &bank_pool.GetBank(current_bank_id); + } + auto [dont_care, other] = current_bank->Reserve(); + const size_t slot = other; // workaround to compile bug. + current_bank->AddReference(); + + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + }; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT, + }; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([dst_buffer = current_bank->GetBuffer(), + src_buffer = counter_buffers[stream], src_offset = offsets[stream], + slot](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); + std::array<VkBufferCopy, 1> copy{VkBufferCopy{ + .srcOffset = src_offset, + .dstOffset = slot * TFBQueryBank::QUERY_SIZE, + .size = TFBQueryBank::QUERY_SIZE, + }}; + cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, WRITE_BARRIER); + }); + return {current_bank_id, slot}; + } + + friend class PrimitivesSucceededStreamer; + + static constexpr size_t NUM_STREAMS = 4; + + QueryCacheRuntime& runtime; + const Device& device; + Scheduler& scheduler; + const MemoryAllocator& memory_allocator; + StagingBufferPool& staging_pool; + VideoCommon::BankPool<TFBQueryBank> bank_pool; + size_t current_bank_id; + TFBQueryBank* current_bank; + vk::Buffer counters_buffer; + + // syncing queue + std::vector<size_t> pending_sync; + + // flush levels + std::vector<size_t> pending_flush_queries; + std::deque<StagingBufferRef> download_buffers; + std::deque<std::vector<size_t>> pending_flush_sets; + std::vector<StagingBufferRef> free_queue; + std::mutex flush_guard; + + // state machine + bool has_started{}; + bool has_flushed_end_pending{}; + size_t buffers_count{}; + std::array<VkBuffer, NUM_STREAMS> counter_buffers{}; + std::array<VkDeviceSize, NUM_STREAMS> offsets{}; + std::array<VAddr, NUM_STREAMS> last_queries; + std::array<size_t, NUM_STREAMS> last_queries_stride; + Maxwell3D::Regs::PrimitiveTopology out_topology; + u64 streams_mask; +}; + +class PrimitivesQueryBase : public VideoCommon::QueryBase { +public: + // Default constructor + PrimitivesQueryBase() + : VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0) {} + + // Parameterized constructor + PrimitivesQueryBase(bool has_timestamp, VAddr address) + : VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0) { + if (has_timestamp) { + flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + } + + u64 stride{}; + VAddr dependant_address{}; + Maxwell3D::Regs::PrimitiveTopology topology{Maxwell3D::Regs::PrimitiveTopology::Points}; + size_t dependant_index{}; + bool dependant_manage{}; +}; + +class PrimitivesSucceededStreamer : public VideoCommon::SimpleStreamer<PrimitivesQueryBase> { +public: + explicit PrimitivesSucceededStreamer(size_t id_, QueryCacheRuntime& runtime_, + TFBCounterStreamer& tfb_streamer_, + Core::Memory::Memory& cpu_memory_) + : VideoCommon::SimpleStreamer<PrimitivesQueryBase>(id_), runtime{runtime_}, + tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} { + MakeDependent(&tfb_streamer); + } + + ~PrimitivesSucceededStreamer() = default; + + size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, + std::optional<u32> subreport_) override { + auto index = BuildQuery(); + auto* new_query = GetQuery(index); + new_query->guest_address = address; + new_query->value = 0; + if (has_timestamp) { + new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; + } + if (!subreport_) { + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + return index; + } + const size_t subreport = static_cast<size_t>(*subreport_); + auto dependant_address_opt = tfb_streamer.GetLastQueryStream(subreport); + bool must_manage_dependance = false; + new_query->topology = tfb_streamer.GetOutputTopology(); + if (dependant_address_opt) { + auto [dep_address, stride] = *dependant_address_opt; + new_query->dependant_address = dep_address; + new_query->stride = stride; + } else { + new_query->dependant_index = + tfb_streamer.WriteCounter(address, has_timestamp, value, subreport_); + auto* dependant_query = tfb_streamer.GetQuery(new_query->dependant_index); + dependant_query->flags |= VideoCommon::QueryFlagBits::IsInvalidated; + must_manage_dependance = true; + if (True(dependant_query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { + new_query->value = 0; + new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + if (must_manage_dependance) { + tfb_streamer.Free(new_query->dependant_index); + } + return index; + } + new_query->stride = 1; + runtime.View3DRegs([new_query, subreport](Maxwell3D& maxwell3d) { + for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { + const auto& tf = maxwell3d.regs.transform_feedback; + if (tf.buffers[i].enable == 0) { + continue; + } + if (tf.controls[i].stream != subreport) { + continue; + } + new_query->stride = tf.controls[i].stride; + break; + } + }); + } + + new_query->dependant_manage = must_manage_dependance; + pending_flush_queries.push_back(index); + return index; + } + + bool HasUnsyncedQueries() const override { + return !pending_flush_queries.empty(); + } + + void PushUnsyncedQueries() override { + std::scoped_lock lk(flush_guard); + pending_flush_sets.emplace_back(std::move(pending_flush_queries)); + pending_flush_queries.clear(); + } + + void PopUnsyncedQueries() override { + std::vector<size_t> flushed_queries; + { + std::scoped_lock lk(flush_guard); + flushed_queries = std::move(pending_flush_sets.front()); + pending_flush_sets.pop_front(); + } + + for (auto q : flushed_queries) { + auto* query = GetQuery(q); + if (True(query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { + continue; + } + + query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; + u64 num_vertices = 0; + if (query->dependant_manage) { + auto* dependant_query = tfb_streamer.GetQuery(query->dependant_index); + num_vertices = dependant_query->value / query->stride; + tfb_streamer.Free(query->dependant_index); + } else { + u8* pointer = cpu_memory.GetPointer(query->dependant_address); + u32 result; + std::memcpy(&result, pointer, sizeof(u32)); + num_vertices = static_cast<u64>(result) / query->stride; + } + query->value = [&]() -> u64 { + switch (query->topology) { + case Maxwell3D::Regs::PrimitiveTopology::Points: + return num_vertices; + case Maxwell3D::Regs::PrimitiveTopology::Lines: + return num_vertices / 2; + case Maxwell3D::Regs::PrimitiveTopology::LineLoop: + return (num_vertices / 2) + 1; + case Maxwell3D::Regs::PrimitiveTopology::LineStrip: + return num_vertices - 1; + case Maxwell3D::Regs::PrimitiveTopology::Patches: + case Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + return num_vertices / 3; + case Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + return num_vertices - 2; + case Maxwell3D::Regs::PrimitiveTopology::Quads: + return num_vertices / 4; + case Maxwell3D::Regs::PrimitiveTopology::Polygon: + return 1U; + default: + return num_vertices; + } + }(); + } + } + +private: + QueryCacheRuntime& runtime; + TFBCounterStreamer& tfb_streamer; + Core::Memory::Memory& cpu_memory; + + // syncing queue + std::vector<size_t> pending_sync; + + // flush levels + std::vector<size_t> pending_flush_queries; + std::deque<std::vector<size_t>> pending_flush_sets; + std::mutex flush_guard; +}; + +} // namespace + +struct QueryCacheRuntimeImpl { + QueryCacheRuntimeImpl(QueryCacheRuntime& runtime, VideoCore::RasterizerInterface* rasterizer_, + Core::Memory::Memory& cpu_memory_, Vulkan::BufferCache& buffer_cache_, + const Device& device_, const MemoryAllocator& memory_allocator_, + Scheduler& scheduler_, StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) + : rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, + buffer_cache{buffer_cache_}, device{device_}, + memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, + guest_streamer(0, runtime), + sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer, + device, scheduler, memory_allocator, compute_pass_descriptor_queue, + descriptor_pool), + tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device, + scheduler, memory_allocator, staging_pool), + primitives_succeeded_streamer( + static_cast<size_t>(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer, + cpu_memory_), + primitives_needed_minus_suceeded_streamer( + static_cast<size_t>(QueryType::StreamingPrimitivesNeededMinusSucceeded), runtime, 0u), + hcr_setup{}, hcr_is_set{}, is_hcr_running{} { -QueryPool::QueryPool(const Device& device_, Scheduler& scheduler, QueryType type_) - : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} + hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT; + hcr_setup.pNext = nullptr; + hcr_setup.flags = 0; -QueryPool::~QueryPool() = default; + conditional_resolve_pass = std::make_unique<ConditionalRenderingResolvePass>( + device, scheduler, descriptor_pool, compute_pass_descriptor_queue); -std::pair<VkQueryPool, u32> QueryPool::Commit() { - std::size_t index; - do { - index = CommitResource(); - } while (usage[index]); - usage[index] = true; + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = sizeof(u32), + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + hcr_resolve_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + } + + VideoCore::RasterizerInterface* rasterizer; + Core::Memory::Memory& cpu_memory; + Vulkan::BufferCache& buffer_cache; + + const Device& device; + const MemoryAllocator& memory_allocator; + Scheduler& scheduler; + StagingBufferPool& staging_pool; + + // Streamers + VideoCommon::GuestStreamer<QueryCacheParams> guest_streamer; + SamplesStreamer sample_streamer; + TFBCounterStreamer tfb_streamer; + PrimitivesSucceededStreamer primitives_succeeded_streamer; + VideoCommon::StubStreamer<QueryCacheParams> primitives_needed_minus_suceeded_streamer; - return {*pools[index / GROW_STEP], static_cast<u32>(index % GROW_STEP)}; + std::vector<std::pair<VAddr, VAddr>> little_cache; + std::vector<std::pair<VkBuffer, VkDeviceSize>> buffers_to_upload_to; + std::vector<size_t> redirect_cache; + std::vector<std::vector<VkBufferCopy>> copies_setup; + + // Host conditional rendering data + std::unique_ptr<ConditionalRenderingResolvePass> conditional_resolve_pass; + vk::Buffer hcr_resolve_buffer; + VkConditionalRenderingBeginInfoEXT hcr_setup; + VkBuffer hcr_buffer; + size_t hcr_offset; + bool hcr_is_set; + bool is_hcr_running; + + // maxwell3d + Maxwell3D* maxwell3d; +}; + +QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, + Core::Memory::Memory& cpu_memory_, + Vulkan::BufferCache& buffer_cache_, const Device& device_, + const MemoryAllocator& memory_allocator_, + Scheduler& scheduler_, StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) { + impl = std::make_unique<QueryCacheRuntimeImpl>( + *this, rasterizer, cpu_memory_, buffer_cache_, device_, memory_allocator_, scheduler_, + staging_pool_, compute_pass_descriptor_queue, descriptor_pool); } -void QueryPool::Allocate(std::size_t begin, std::size_t end) { - usage.resize(end); +void QueryCacheRuntime::Bind3DEngine(Maxwell3D* maxwell3d) { + impl->maxwell3d = maxwell3d; +} - pools.push_back(device.GetLogical().CreateQueryPool({ - .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .queryType = GetTarget(type), - .queryCount = static_cast<u32>(end - begin), - .pipelineStatistics = 0, - })); +template <typename Func> +void QueryCacheRuntime::View3DRegs(Func&& func) { + func(*impl->maxwell3d); +} + +void QueryCacheRuntime::EndHostConditionalRendering() { + PauseHostConditionalRendering(); + impl->hcr_is_set = false; + impl->is_hcr_running = false; + impl->hcr_buffer = nullptr; + impl->hcr_offset = 0; +} + +void QueryCacheRuntime::PauseHostConditionalRendering() { + if (!impl->hcr_is_set) { + return; + } + if (impl->is_hcr_running) { + impl->scheduler.Record( + [](vk::CommandBuffer cmdbuf) { cmdbuf.EndConditionalRenderingEXT(); }); + } + impl->is_hcr_running = false; } -void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { - const auto it = - std::find_if(pools.begin(), pools.end(), [query_pool = query.first](vk::QueryPool& pool) { - return query_pool == *pool; +void QueryCacheRuntime::ResumeHostConditionalRendering() { + if (!impl->hcr_is_set) { + return; + } + if (!impl->is_hcr_running) { + impl->scheduler.Record([hcr_setup = impl->hcr_setup](vk::CommandBuffer cmdbuf) { + cmdbuf.BeginConditionalRenderingEXT(hcr_setup); }); + } + impl->is_hcr_running = true; +} - if (it != std::end(pools)) { - const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); - usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; +void QueryCacheRuntime::HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, + bool is_equal) { + { + std::scoped_lock lk(impl->buffer_cache.mutex); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = + impl->buffer_cache.ObtainCPUBuffer(object.address, 8, sync_info, post_op); + impl->hcr_buffer = buffer->Handle(); + impl->hcr_offset = offset; + } + if (impl->hcr_is_set) { + if (impl->hcr_setup.buffer == impl->hcr_buffer && + impl->hcr_setup.offset == impl->hcr_offset) { + ResumeHostConditionalRendering(); + return; + } + PauseHostConditionalRendering(); } + impl->hcr_setup.buffer = impl->hcr_buffer; + impl->hcr_setup.offset = impl->hcr_offset; + impl->hcr_setup.flags = is_equal ? VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT : 0; + impl->hcr_is_set = true; + impl->is_hcr_running = false; + ResumeHostConditionalRendering(); } -QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, const Device& device_, - Scheduler& scheduler_) - : QueryCacheBase{rasterizer_, cpu_memory_}, device{device_}, scheduler{scheduler_}, - query_pools{ - QueryPool{device_, scheduler_, QueryType::SamplesPassed}, - } {} - -QueryCache::~QueryCache() { - // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class - // destructor is called. The query cache should be redesigned to have a proper ownership model - // instead of using shared pointers. - for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { - auto& stream = Stream(static_cast<QueryType>(query_type)); - stream.Update(false); - stream.Reset(); +void QueryCacheRuntime::HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal) { + VkBuffer to_resolve; + u32 to_resolve_offset; + { + std::scoped_lock lk(impl->buffer_cache.mutex); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::NoSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = + impl->buffer_cache.ObtainCPUBuffer(address, 24, sync_info, post_op); + to_resolve = buffer->Handle(); + to_resolve_offset = static_cast<u32>(offset); } + if (impl->is_hcr_running) { + PauseHostConditionalRendering(); + } + impl->conditional_resolve_pass->Resolve(*impl->hcr_resolve_buffer, to_resolve, + to_resolve_offset, false); + impl->hcr_setup.buffer = *impl->hcr_resolve_buffer; + impl->hcr_setup.offset = 0; + impl->hcr_setup.flags = is_equal ? 0 : VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; + impl->hcr_is_set = true; + impl->is_hcr_running = false; + ResumeHostConditionalRendering(); } -std::pair<VkQueryPool, u32> QueryCache::AllocateQuery(QueryType type) { - return query_pools[static_cast<std::size_t>(type)].Commit(); +bool QueryCacheRuntime::HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, + [[maybe_unused]] bool qc_dirty) { + if (!impl->device.IsExtConditionalRendering()) { + return false; + } + HostConditionalRenderingCompareValueImpl(object_1, false); + return true; } -void QueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { - query_pools[static_cast<std::size_t>(type)].Reserve(query); +bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, + VideoCommon::LookupData object_2, + bool qc_dirty, bool equal_check) { + if (!impl->device.IsExtConditionalRendering()) { + return false; + } + + const auto check_in_bc = [&](VAddr address) { + return impl->buffer_cache.IsRegionGpuModified(address, 8); + }; + const auto check_value = [&](VAddr address) { + u8* ptr = impl->cpu_memory.GetPointer(address); + u64 value{}; + std::memcpy(&value, ptr, sizeof(value)); + return value == 0; + }; + std::array<VideoCommon::LookupData*, 2> objects{&object_1, &object_2}; + std::array<bool, 2> is_in_bc{}; + std::array<bool, 2> is_in_qc{}; + std::array<bool, 2> is_in_ac{}; + std::array<bool, 2> is_null{}; + { + std::scoped_lock lk(impl->buffer_cache.mutex); + for (size_t i = 0; i < 2; i++) { + is_in_qc[i] = objects[i]->found_query != nullptr; + is_in_bc[i] = !is_in_qc[i] && check_in_bc(objects[i]->address); + is_in_ac[i] = is_in_qc[i] || is_in_bc[i]; + } + } + + if (!is_in_ac[0] && !is_in_ac[1]) { + EndHostConditionalRendering(); + return false; + } + + if (!qc_dirty && !is_in_bc[0] && !is_in_bc[1]) { + EndHostConditionalRendering(); + return false; + } + + const bool is_gpu_high = Settings::IsGPULevelHigh(); + if (!is_gpu_high && impl->device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { + return true; + } + + for (size_t i = 0; i < 2; i++) { + is_null[i] = !is_in_ac[i] && check_value(objects[i]->address); + } + + for (size_t i = 0; i < 2; i++) { + if (is_null[i]) { + size_t j = (i + 1) % 2; + HostConditionalRenderingCompareValueImpl(*objects[j], equal_check); + return true; + } + } + + if (!is_gpu_high) { + return true; + } + + if (!is_in_bc[0] && !is_in_bc[1]) { + // Both queries are in query cache, it's best to just flush. + return true; + } + HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); + return true; } -HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, - QueryType type_) - : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, - query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { - const vk::Device* logical = &cache.GetDevice().GetLogical(); - cache.GetScheduler().Record([logical, query_ = query](vk::CommandBuffer cmdbuf) { - const bool use_precise = Settings::IsGPULevelHigh(); - logical->ResetQueryPool(query_.first, query_.second, 1); - cmdbuf.BeginQuery(query_.first, query_.second, - use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); - }); +QueryCacheRuntime::~QueryCacheRuntime() = default; + +VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryType query_type) { + switch (query_type) { + case QueryType::Payload: + return &impl->guest_streamer; + case QueryType::ZPassPixelCount64: + return &impl->sample_streamer; + case QueryType::StreamingByteCount: + return &impl->tfb_streamer; + case QueryType::StreamingPrimitivesNeeded: + case QueryType::VtgPrimitivesOut: + case QueryType::StreamingPrimitivesSucceeded: + return &impl->primitives_succeeded_streamer; + case QueryType::StreamingPrimitivesNeededMinusSucceeded: + return &impl->primitives_needed_minus_suceeded_streamer; + default: + return nullptr; + } } -HostCounter::~HostCounter() { - cache.Reserve(type, query); +void QueryCacheRuntime::Barriers(bool is_prebarrier) { + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + if (is_prebarrier) { + impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); + }); + } else { + impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); + }); + } } -void HostCounter::EndQuery() { - cache.GetScheduler().Record([query_ = query](vk::CommandBuffer cmdbuf) { - cmdbuf.EndQuery(query_.first, query_.second); +template <typename SyncValuesType> +void QueryCacheRuntime::SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer) { + if (values.size() == 0) { + return; + } + impl->redirect_cache.clear(); + impl->little_cache.clear(); + size_t total_size = 0; + for (auto& sync_val : values) { + total_size += sync_val.size; + bool found = false; + VAddr base = Common::AlignDown(sync_val.address, Core::Memory::YUZU_PAGESIZE); + VAddr base_end = base + Core::Memory::YUZU_PAGESIZE; + for (size_t i = 0; i < impl->little_cache.size(); i++) { + const auto set_found = [&] { + impl->redirect_cache.push_back(i); + found = true; + }; + auto& loc = impl->little_cache[i]; + if (base < loc.second && loc.first < base_end) { + set_found(); + break; + } + if (loc.first == base_end) { + loc.first = base; + set_found(); + break; + } + if (loc.second == base) { + loc.second = base_end; + set_found(); + break; + } + } + if (!found) { + impl->redirect_cache.push_back(impl->little_cache.size()); + impl->little_cache.emplace_back(base, base_end); + } + } + + // Vulkan part. + std::scoped_lock lk(impl->buffer_cache.mutex); + impl->buffer_cache.BufferOperations([&] { + impl->buffers_to_upload_to.clear(); + for (auto& pair : impl->little_cache) { + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = impl->buffer_cache.ObtainCPUBuffer( + pair.first, static_cast<u32>(pair.second - pair.first), sync_info, post_op); + impl->buffers_to_upload_to.emplace_back(buffer->Handle(), offset); + } }); -} -u64 HostCounter::BlockingQuery(bool async) const { - if (!async) { - cache.GetScheduler().Wait(tick); - } - u64 data; - const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( - query.first, query.second, 1, sizeof(data), &data, sizeof(data), - VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); - - switch (query_result) { - case VK_SUCCESS: - return data; - case VK_ERROR_DEVICE_LOST: - cache.GetDevice().ReportLoss(); - [[fallthrough]]; - default: - throw vk::Exception(query_result); + VkBuffer src_buffer; + [[maybe_unused]] StagingBufferRef ref; + impl->copies_setup.clear(); + impl->copies_setup.resize(impl->little_cache.size()); + if constexpr (SyncValuesType::GeneratesBaseBuffer) { + ref = impl->staging_pool.Request(total_size, MemoryUsage::Upload); + size_t current_offset = ref.offset; + size_t accumulated_size = 0; + for (size_t i = 0; i < values.size(); i++) { + size_t which_copy = impl->redirect_cache[i]; + impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ + .srcOffset = current_offset + accumulated_size, + .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - + impl->little_cache[which_copy].first, + .size = values[i].size, + }); + std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value, + values[i].size); + accumulated_size += values[i].size; + } + src_buffer = ref.buffer; + } else { + for (size_t i = 0; i < values.size(); i++) { + size_t which_copy = impl->redirect_cache[i]; + impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ + .srcOffset = values[i].offset, + .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - + impl->little_cache[which_copy].first, + .size = values[i].size, + }); + } + src_buffer = base_src_buffer; } + + impl->scheduler.RequestOutsideRenderPassOperationContext(); + impl->scheduler.Record([src_buffer, dst_buffers = std::move(impl->buffers_to_upload_to), + vk_copies = std::move(impl->copies_setup)](vk::CommandBuffer cmdbuf) { + size_t size = dst_buffers.size(); + for (size_t i = 0; i < size; i++) { + cmdbuf.CopyBuffer(src_buffer, dst_buffers[i].first, vk_copies[i]); + } + }); } } // namespace Vulkan + +namespace VideoCommon { + +template class QueryCacheBase<Vulkan::QueryCacheParams>; + +} // namespace VideoCommon diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index c1b9552eb..e9a1ea169 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -1,101 +1,75 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once -#include <cstddef> #include <memory> -#include <utility> -#include <vector> -#include "common/common_types.h" -#include "video_core/query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_pool.h" -#include "video_core/vulkan_common/vulkan_wrapper.h" +#include "video_core/query_cache/query_cache_base.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" namespace VideoCore { class RasterizerInterface; } +namespace VideoCommon { +class StreamerInterface; +} + namespace Vulkan { -class CachedQuery; class Device; -class HostCounter; -class QueryCache; class Scheduler; +class StagingBufferPool; -using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; +struct QueryCacheRuntimeImpl; -class QueryPool final : public ResourcePool { +class QueryCacheRuntime { public: - explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type); - ~QueryPool() override; + explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, + Core::Memory::Memory& cpu_memory_, + Vulkan::BufferCache& buffer_cache_, const Device& device_, + const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, + StagingBufferPool& staging_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool); + ~QueryCacheRuntime(); - std::pair<VkQueryPool, u32> Commit(); + template <typename SyncValuesType> + void SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer = nullptr); - void Reserve(std::pair<VkQueryPool, u32> query); + void Barriers(bool is_prebarrier); -protected: - void Allocate(std::size_t begin, std::size_t end) override; + void EndHostConditionalRendering(); -private: - static constexpr std::size_t GROW_STEP = 512; + void PauseHostConditionalRendering(); - const Device& device; - const VideoCore::QueryType type; + void ResumeHostConditionalRendering(); - std::vector<vk::QueryPool> pools; - std::vector<bool> usage; -}; + bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty); -class QueryCache final - : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { -public: - explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, const Device& device_, - Scheduler& scheduler_); - ~QueryCache(); - - std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type); + bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, + VideoCommon::LookupData object_2, bool qc_dirty, + bool equal_check); - void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query); + VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type); - const Device& GetDevice() const noexcept { - return device; - } + void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d); - Scheduler& GetScheduler() const noexcept { - return scheduler; - } + template <typename Func> + void View3DRegs(Func&& func); private: - const Device& device; - Scheduler& scheduler; - std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; + void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal); + void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal); + friend struct QueryCacheRuntimeImpl; + std::unique_ptr<QueryCacheRuntimeImpl> impl; }; -class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { -public: - explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, - VideoCore::QueryType type_); - ~HostCounter(); - - void EndQuery(); - -private: - u64 BlockingQuery(bool async = false) const override; - - QueryCache& cache; - const VideoCore::QueryType type; - const std::pair<VkQueryPool, u32> query; - const u64 tick; +struct QueryCacheParams { + using RuntimeType = typename Vulkan::QueryCacheRuntime; }; -class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { -public: - explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_) - : CachedQueryBase{cpu_addr_, host_ptr_} {} -}; +using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 01e76a82c..c7ce7c312 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -24,6 +24,7 @@ #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -170,9 +171,11 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), buffer_cache(*this, cpu_memory_, buffer_cache_runtime), + query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler, + staging_pool, compute_pass_descriptor_queue, descriptor_pool), + query_cache(gpu, *this, cpu_memory_, query_cache_runtime), pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), - query_cache{*this, cpu_memory_, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { @@ -189,14 +192,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { FlushWork(); gpu_memory->FlushCaching(); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - query_cache.UpdateCounters(); - } -#else - query_cache.UpdateCounters(); -#endif + query_cache.NotifySegment(true); GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; if (!pipeline) { @@ -207,13 +203,12 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); - BeginTransformFeedback(); - UpdateDynamicStates(); + HandleTransformFeedback(); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); draw_func(); - - EndTransformFeedback(); } void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { @@ -241,6 +236,14 @@ void RasterizerVulkan::DrawIndirect() { const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); const auto& buffer = indirect_buffer.first; const auto& offset = indirect_buffer.second; + if (params.is_byte_count) { + scheduler.Record([buffer_obj = buffer->Handle(), offset, + stride = params.stride](vk::CommandBuffer cmdbuf) { + cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, offset, 0, + static_cast<u32>(stride)); + }); + return; + } if (params.include_count) { const auto count = buffer_cache.GetDrawIndirectCount(); const auto& draw_buffer = count.first; @@ -280,20 +283,15 @@ void RasterizerVulkan::DrawTexture() { SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - query_cache.UpdateCounters(); - } -#else - query_cache.UpdateCounters(); -#endif + query_cache.NotifySegment(true); texture_cache.SynchronizeGraphicsDescriptors(); texture_cache.UpdateRenderTargets(false); UpdateDynamicStates(); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); @@ -316,14 +314,9 @@ void RasterizerVulkan::Clear(u32 layer_count) { FlushWork(); gpu_memory->FlushCaching(); -#if ANDROID - if (Settings::IsGPULevelHigh()) { - // This is problematic on Android, disable on GPU Normal. - query_cache.UpdateCounters(); - } -#else - query_cache.UpdateCounters(); -#endif + query_cache.NotifySegment(true); + query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, + maxwell3d->regs.zpass_pixel_count_enable); auto& regs = maxwell3d->regs; const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || @@ -482,13 +475,13 @@ void RasterizerVulkan::DispatchCompute() { scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); } -void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { - query_cache.ResetCounter(type); +void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) { + query_cache.CounterReset(type); } -void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, - std::optional<u64> timestamp) { - query_cache.Query(gpu_addr, type, timestamp); +void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { + query_cache.CounterReport(gpu_addr, type, flags, payload, subreport); } void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -669,8 +662,8 @@ void RasterizerVulkan::SignalReference() { fence_manager.SignalReference(); } -void RasterizerVulkan::ReleaseFences() { - fence_manager.WaitPendingFences(); +void RasterizerVulkan::ReleaseFences(bool force) { + fence_manager.WaitPendingFences(force); } void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, @@ -694,6 +687,8 @@ void RasterizerVulkan::WaitForIdle() { flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; } + query_cache.NotifyWFI(); + scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { cmdbuf.SetEvent(event, flags); @@ -737,19 +732,7 @@ void RasterizerVulkan::TickFrame() { bool RasterizerVulkan::AccelerateConditionalRendering() { gpu_memory->FlushCaching(); - if (Settings::IsGPULevelHigh()) { - // TODO(Blinkhawk): Reimplement Host conditional rendering. - return false; - } - // Medium / Low Hack: stub any checks on queries written into the buffer cache. - const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; - Maxwell::ReportSemaphore::Compare cmp; - if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), - VideoCommon::CacheType::BufferCache | - VideoCommon::CacheType::QueryCache)) { - return true; - } - return false; + return query_cache.AccelerateHostConditionalRendering(); } bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, @@ -795,6 +778,7 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, if (!image_view) { return false; } + query_cache.NotifySegment(false); screen_info.image = image_view->ImageHandle(); screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); screen_info.width = image_view->size.width; @@ -933,31 +917,18 @@ void RasterizerVulkan::UpdateDynamicStates() { } } -void RasterizerVulkan::BeginTransformFeedback() { +void RasterizerVulkan::HandleTransformFeedback() { const auto& regs = maxwell3d->regs; - if (regs.transform_feedback_enabled == 0) { - return; - } if (!device.IsExtTransformFeedbackSupported()) { LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); return; } - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || - regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); - scheduler.Record( - [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); -} - -void RasterizerVulkan::EndTransformFeedback() { - const auto& regs = maxwell3d->regs; - if (regs.transform_feedback_enabled == 0) { - return; - } - if (!device.IsExtTransformFeedbackSupported()) { - return; + query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount, + regs.transform_feedback_enabled); + if (regs.transform_feedback_enabled != 0) { + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || + regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); } - scheduler.Record( - [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index b31982485..ad069556c 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -84,8 +84,9 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCore::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; + void ResetCounter(VideoCommon::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, + VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -106,7 +107,7 @@ public: void SyncOperation(std::function<void()>&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences() override; + void ReleaseFences(bool force = true) override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; @@ -146,9 +147,7 @@ private: void UpdateDynamicStates(); - void BeginTransformFeedback(); - - void EndTransformFeedback(); + void HandleTransformFeedback(); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); @@ -195,8 +194,9 @@ private: TextureCache texture_cache; BufferCacheRuntime buffer_cache_runtime; BufferCache buffer_cache; - PipelineCache pipeline_cache; + QueryCacheRuntime query_cache_runtime; QueryCache query_cache; + PipelineCache pipeline_cache; AccelerateDMA accelerate_dma; FenceManager fence_manager; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 89fd31b4f..3be7837f4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -243,10 +243,10 @@ void Scheduler::AllocateNewContext() { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. - query_cache->UpdateCounters(); + query_cache->NotifySegment(true); } #else - query_cache->UpdateCounters(); + query_cache->NotifySegment(true); #endif } } @@ -261,11 +261,12 @@ void Scheduler::EndPendingOperations() { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. - query_cache->DisableStreams(); + // query_cache->DisableStreams(); } #else - query_cache->DisableStreams(); + // query_cache->DisableStreams(); #endif + query_cache->NotifySegment(false); EndRenderPass(); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 475c682eb..da03803aa 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -17,6 +17,11 @@ #include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +namespace VideoCommon { +template <typename Trait> +class QueryCacheBase; +} + namespace Vulkan { class CommandPool; @@ -24,7 +29,8 @@ class Device; class Framebuffer; class GraphicsPipeline; class StateTracker; -class QueryCache; + +struct QueryCacheParams; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. @@ -63,7 +69,7 @@ public: void InvalidateState(); /// Assigns the query cache. - void SetQueryCache(QueryCache& query_cache_) { + void SetQueryCache(VideoCommon::QueryCacheBase<QueryCacheParams>& query_cache_) { query_cache = &query_cache_; } @@ -219,7 +225,7 @@ private: std::unique_ptr<MasterSemaphore> master_semaphore; std::unique_ptr<CommandPool> command_pool; - QueryCache* query_cache = nullptr; + VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; |