diff options
Diffstat (limited to 'src/video_core/renderer_vulkan')
-rw-r--r-- | src/video_core/renderer_vulkan/vk_compute_pass.cpp | 101 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_compute_pass.h | 2 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_query_cache.cpp | 13 |
3 files changed, 66 insertions, 50 deletions
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 44ec5a032..289d5b25c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -179,8 +179,10 @@ struct AstcPushConstants { }; struct QueriesPrefixScanPushConstants { + u32 min_accumulation_base; u32 max_accumulation_base; u32 accumulation_limit; + u32 buffer_offset; }; } // Anonymous namespace @@ -416,56 +418,65 @@ QueriesPrefixScanPass::QueriesPrefixScanPass( device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) - : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV), - {32}), + : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)), scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, size_t number_of_sums, - size_t max_accumulation_limit) { - size_t aligned_runs = Common::AlignUp(number_of_sums, 32); - - compute_pass_descriptor_queue.Acquire(); - compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64)); - compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64)); - compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); - const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; - - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums, - aligned_runs](vk::CommandBuffer cmdbuf) { - static constexpr VkMemoryBarrier read_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, - }; - static constexpr VkMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | - VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | - VK_ACCESS_UNIFORM_READ_BIT | - VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, - }; - const QueriesPrefixScanPushConstants uniforms{ - .max_accumulation_base = static_cast<u32>(max_accumulation_limit), - .accumulation_limit = static_cast<u32>(number_of_sums - 1), - }; - const VkDescriptorSet set = descriptor_allocator.Commit(); - device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + size_t min_accumulation_limit, size_t max_accumulation_limit) { + size_t current_runs = number_of_sums; + size_t offset = 0; + while (current_runs != 0) { + static constexpr size_t DISPATCH_SIZE = 2048U; + size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE); + current_runs -= runs_to_do; + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + size_t used_offset = offset; + offset += runs_to_do; + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit, + runs_to_do, used_offset](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | + VK_ACCESS_UNIFORM_READ_BIT | + VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const QueriesPrefixScanPushConstants uniforms{ + .min_accumulation_base = static_cast<u32>(min_accumulation_limit), + .max_accumulation_base = static_cast<u32>(max_accumulation_limit), + .accumulation_limit = static_cast<u32>(runs_to_do - 1), + .buffer_offset = static_cast<u32>(used_offset), + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); - cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); - cmdbuf.Dispatch(static_cast<u32>(aligned_runs / 32U), 1, 1); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); - }); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(1, 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, + write_barrier); + }); + } } ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 68ffb1b82..3ff935639 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -104,7 +104,7 @@ public: ComputePassDescriptorQueue& compute_pass_descriptor_queue_); void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, - size_t number_of_sums, size_t max_accumulation_limit); + size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit); private: Scheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 2cc007716..a32da3ba3 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -181,7 +181,8 @@ public: }); rasterizer->SyncOperation(std::move(func)); accumulation_since_last_sync = false; - last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used); + first_accumulation_checkpoint = std::min(first_accumulation_checkpoint, num_slots_used); + last_accumulation_checkpoint = std::max(last_accumulation_checkpoint, num_slots_used); } void CloseCounter() override { @@ -285,7 +286,9 @@ public: resolve_buffers.push_back(intermediary_buffer_index); queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], *buffers[resolve_buffer_index], num_slots_used, - std::min(last_accumulation_checkpoint, num_slots_used)); + std::min(first_accumulation_checkpoint, num_slots_used), + last_accumulation_checkpoint); + } else { scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { @@ -298,7 +301,8 @@ public: rasterizer->SyncOperation(std::move(func)); AbandonCurrentQuery(); num_slots_used = 0; - last_accumulation_checkpoint = std::numeric_limits<size_t>::max(); + first_accumulation_checkpoint = std::numeric_limits<size_t>::max(); + last_accumulation_checkpoint = 0; accumulation_since_last_sync = has_multi_queries; pending_sync.clear(); } @@ -506,7 +510,7 @@ private: template <bool is_resolve> size_t ObtainBuffer(size_t num_needed) { - const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed)); + const size_t log_2 = std::max<size_t>(11U, Common::Log2Ceil64(num_needed)); if constexpr (is_resolve) { if (resolve_table[log_2] != 0) { return resolve_table[log_2] - 1; @@ -563,6 +567,7 @@ private: VkQueryPool current_query_pool; size_t current_query_id; size_t num_slots_used{}; + size_t first_accumulation_checkpoint{}; size_t last_accumulation_checkpoint{}; bool accumulation_since_last_sync{}; VideoCommon::HostQueryBase* current_query; |