From c8237d5c312485394389b2520451ef720604ea9a Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 20 Aug 2023 17:53:08 +0200 Subject: Query Cache: Implement host side sample counting. --- src/video_core/renderer_vulkan/vk_compute_pass.cpp | 110 ++++++++++++++- src/video_core/renderer_vulkan/vk_compute_pass.h | 14 ++ src/video_core/renderer_vulkan/vk_query_cache.cpp | 147 ++++++++++++++------- 3 files changed, 223 insertions(+), 48 deletions(-) (limited to 'src/video_core/renderer_vulkan') diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 039dc95e1..a1af08cda 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -12,6 +12,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h" +#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" @@ -58,6 +59,30 @@ constexpr std::array INPUT_OUTPUT_DESCRIPTOR_SE }, }}; +constexpr std::array QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, +}}; + constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .uniform_buffers = 0, .storage_buffers = 2, @@ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .score = 2, }; +constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ + .uniform_buffers = 0, + .storage_buffers = 3, + .texture_buffers = 0, + .image_buffers = 0, + .textures = 0, + .images = 0, + .score = 3, +}; + constexpr std::array ASTC_DESCRIPTOR_SET_BINDINGS{{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT .stride = sizeof(DescriptorUpdateEntry), }; +constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 3, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), +}; + constexpr std::array ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ { @@ -132,6 +176,11 @@ struct AstcPushConstants { u32 block_height; u32 block_height_mask; }; + +struct QueriesPrefixScanPushConstants { + u32 max_accumulation_base; + u32 accumulation_limit; +}; } // Anonymous namespace ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, @@ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero) { - scheduler.RequestOutsideRenderPassOperationContext(); - const size_t compare_size = compare_to_zero ? 8 : 24; compute_pass_descriptor_queue.Acquire(); @@ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ static constexpr VkMemoryBarrier read_barrier{ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = nullptr, - .srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, }; static constexpr VkMemoryBarrier write_barrier{ @@ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ }); } +QueriesPrefixScanPass::QueriesPrefixScanPass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, + QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, + COMPUTE_PUSH_CONSTANT_RANGE, + QUERIES_PREFIX_SCAN_SUM_COMP_SPV), + scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} + +void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, + VkBuffer src_buffer, size_t number_of_sums, + size_t max_accumulation_limit) { + size_t aligned_runs = Common::AlignUp(number_of_sums, 32); + + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums, + aligned_runs](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | + VK_ACCESS_UNIFORM_READ_BIT | + VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const QueriesPrefixScanPushConstants uniforms{ + .max_accumulation_base = static_cast(max_accumulation_limit), + .accumulation_limit = static_cast(number_of_sums - 1), + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(static_cast(aligned_runs / 32U), 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); + }); +} + ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index c62f30d30..e6ff86e9a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -95,6 +95,20 @@ private: ComputePassDescriptorQueue& compute_pass_descriptor_queue; }; +class QueriesPrefixScanPass final : public ComputePass { +public: + explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + + void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, + size_t number_of_sums, size_t max_accumulation_limit); + +private: + Scheduler& scheduler; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + class ASTCDecoderPass final : public ComputePass { public: explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 2147776f8..ded190ae0 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -11,6 +11,7 @@ #include #include +#include "common/bit_util.h" #include "common/common_types.h" #include "core/memory.h" #include "video_core/engines/draw_manager.h" @@ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer { public: explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, VideoCore::RasterizerInterface* rasterizer_, const Device& device_, - Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_} { - BuildResolveBuffer(); current_bank = nullptr; current_query = nullptr; ammend_value = 0; acumulation_value = 0; + queries_prefix_scan_pass = std::make_unique( + device, scheduler, descriptor_pool, compute_pass_descriptor_queue); + + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = 8, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); + }); } ~SamplesStreamer() = default; @@ -159,6 +180,8 @@ public: acumulation_value = 0; }); rasterizer->SyncOperation(std::move(func)); + accumulation_since_last_sync = false; + last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used); } void CloseCounter() override { @@ -175,7 +198,8 @@ public: } for (size_t i = 0; i < sync_values_stash.size(); i++) { - runtime.template SyncValues(sync_values_stash[i], *resolve_buffers[i]); + runtime.template SyncValues(sync_values_stash[i], + *buffers[resolve_buffers[i]]); } sync_values_stash.clear(); @@ -189,36 +213,21 @@ public: sync_values_stash.clear(); sync_values_stash.emplace_back(); std::vector* sync_values = &sync_values_stash.back(); - sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); + sync_values->reserve(num_slots_used); std::unordered_map> offsets; - size_t this_bank_slot = std::numeric_limits::max(); - size_t resolve_slots_remaining = resolve_slots; - size_t resolve_buffer_index = 0; + resolve_buffers.clear(); + size_t resolve_buffer_index = ObtainBuffer(num_slots_used); + resolve_buffers.push_back(resolve_buffer_index); + size_t base_offset = 0; + ApplyBanksWideOp(pending_sync, [&](SamplesQueryBank* bank, size_t start, size_t amount) { size_t bank_id = bank->GetIndex(); - if (this_bank_slot != bank_id) { - this_bank_slot = bank_id; - if (resolve_slots_remaining == 0) { - resolve_buffer_index++; - if (resolve_buffer_index >= resolve_buffers.size()) { - BuildResolveBuffer(); - } - resolve_slots_remaining = resolve_slots; - sync_values_stash.emplace_back(); - sync_values = &sync_values_stash.back(); - sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); - } - resolve_slots_remaining--; - } - auto& resolve_buffer = resolve_buffers[resolve_buffer_index]; - const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * - (resolve_slots - resolve_slots_remaining - 1); + auto& resolve_buffer = buffers[resolve_buffer_index]; VkQueryPool query_pool = bank->GetInnerPool(); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([start, amount, base_offset, query_pool, buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { - size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE; const VkBufferMemoryBarrier copy_query_pool_barrier{ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .pNext = nullptr, @@ -227,39 +236,60 @@ public: .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .buffer = buffer, - .offset = final_offset, + .offset = base_offset, .size = amount * SamplesQueryBank::QUERY_SIZE, }; cmdbuf.CopyQueryPoolResults( query_pool, static_cast(start), static_cast(amount), buffer, - static_cast(final_offset), SamplesQueryBank::QUERY_SIZE, + static_cast(base_offset), SamplesQueryBank::QUERY_SIZE, VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); }); - offsets[bank_id] = {sync_values_stash.size() - 1, base_offset}; + offsets[bank_id] = {start, base_offset}; + base_offset += amount * SamplesQueryBank::QUERY_SIZE; }); // Convert queries + bool has_multi_queries = false; for (auto q : pending_sync) { auto* query = GetQuery(q); + size_t sync_value_slot = 0; if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { continue; } if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { continue; } - if (query->size_slots > 1) { - // This is problematic. - // UNIMPLEMENTED(); + if (accumulation_since_last_sync || query->size_slots > 1) { + if (!has_multi_queries) { + has_multi_queries = true; + sync_values_stash.emplace_back(); + } + sync_value_slot = 1; } query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; auto loc_data = offsets[query->start_bank_id]; - sync_values_stash[loc_data.first].emplace_back(HostSyncValues{ + sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{ .address = query->guest_address, .size = SamplesQueryBank::QUERY_SIZE, - .offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE, + .offset = + loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) * + SamplesQueryBank::QUERY_SIZE, + }); + } + + if (has_multi_queries) { + size_t intermediary_buffer_index = ObtainBuffer(num_slots_used); + resolve_buffers.push_back(intermediary_buffer_index); + queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], + *buffers[resolve_buffer_index], num_slots_used, + std::min(last_accumulation_checkpoint, num_slots_used)); + } else { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); }); } @@ -267,6 +297,9 @@ public: std::function func([this] { ammend_value = acumulation_value; }); rasterizer->SyncOperation(std::move(func)); AbandonCurrentQuery(); + num_slots_used = 0; + last_accumulation_checkpoint = std::numeric_limits::max(); + accumulation_since_last_sync = has_multi_queries; pending_sync.clear(); } @@ -400,6 +433,7 @@ private: void ReserveHostQuery() { size_t new_slot = ReserveBankSlot(); current_bank->AddReference(1); + num_slots_used++; if (current_query) { size_t bank_id = current_query->start_bank_id; size_t banks_set = current_query->size_banks - 1; @@ -470,32 +504,50 @@ private: }); } - void BuildResolveBuffer() { + template + size_t ObtainBuffer(size_t num_needed) { + const size_t log_2 = std::max(6U, Common::Log2Ceil64(num_needed)); + if constexpr (is_resolve) { + if (resolve_table[log_2] != 0) { + return resolve_table[log_2] - 1; + } + } else { + if (intermediary_table[log_2] != 0) { + return intermediary_table[log_2] - 1; + } + } const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots, + .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; - resolve_buffers.emplace_back( - memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); + buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); + if constexpr (is_resolve) { + resolve_table[log_2] = buffers.size(); + } else { + intermediary_table[log_2] = buffers.size(); + } + return buffers.size() - 1; } - static constexpr size_t resolve_slots = 8; - QueryCacheRuntime& runtime; VideoCore::RasterizerInterface* rasterizer; const Device& device; Scheduler& scheduler; const MemoryAllocator& memory_allocator; VideoCommon::BankPool bank_pool; - std::deque resolve_buffers; + std::deque buffers; + std::array resolve_table{}; + std::array intermediary_table{}; + vk::Buffer accumulation_buffer; std::deque> sync_values_stash; + std::vector resolve_buffers; // syncing queue std::vector pending_sync; @@ -510,10 +562,14 @@ private: SamplesQueryBank* current_bank; VkQueryPool current_query_pool; size_t current_query_id; + size_t num_slots_used{}; + size_t last_accumulation_checkpoint{}; + bool accumulation_since_last_sync{}; VideoCommon::HostQueryBase* current_query; bool has_started{}; - bool current_unset{}; std::mutex flush_guard; + + std::unique_ptr queries_prefix_scan_pass; }; // Transform feedback queries @@ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl { memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, guest_streamer(0, runtime), sample_streamer(static_cast(QueryType::ZPassPixelCount64), runtime, rasterizer, - device, scheduler, memory_allocator), + device, scheduler, memory_allocator, compute_pass_descriptor_queue, + descriptor_pool), tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, scheduler, memory_allocator, staging_pool), primitives_succeeded_streamer( @@ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku return true; } } - if (!is_in_bc[0] && !is_in_bc[1]) { + /*if (!is_in_bc[0] && !is_in_bc[1]) { // Both queries are in query cache, it's best to just flush. - return false; - } + return true; + }*/ HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); return true; } -- cgit v1.2.3