From 65d4a16afd7d84742751d3f9b3b0976738cac136 Mon Sep 17 00:00:00 2001 From: Liam Date: Fri, 27 Oct 2023 23:48:55 -0400 Subject: renderer_vulkan: fix cropping for presentation --- src/video_core/renderer_vulkan/vk_blit_screen.cpp | 101 ++++++++++++---------- 1 file changed, 55 insertions(+), 46 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 52fc142d1..459ab32c2 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -1395,63 +1395,72 @@ void BlitScreen::SetUniformData(BufferData& data, const Layout::FramebufferLayou MakeOrthographicMatrix(static_cast(layout.width), static_cast(layout.height)); } -void BlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, - const Layout::FramebufferLayout layout) const { - const auto& framebuffer_transform_flags = framebuffer.transform_flags; - const auto& framebuffer_crop_rect = framebuffer.crop_rect; - - static constexpr Common::Rectangle texcoords{0.f, 0.f, 1.f, 1.f}; - auto left = texcoords.left; - auto right = texcoords.right; - - switch (framebuffer_transform_flags) { - case Service::android::BufferTransformFlags::Unset: - break; - case Service::android::BufferTransformFlags::FlipV: - // Flip the framebuffer vertically - left = texcoords.right; - right = texcoords.left; - break; - default: - UNIMPLEMENTED_MSG("Unsupported framebuffer_transform_flags={}", - static_cast(framebuffer_transform_flags)); - break; +static Common::Rectangle NormalizeCrop(Common::Rectangle crop, + const Tegra::FramebufferConfig& framebuffer) { + f32 left, top, right, bottom; + + if (!crop.IsEmpty()) { + // If crop rectangle is not empty, apply properties from rectangle. + left = static_cast(crop.left); + top = static_cast(crop.top); + right = static_cast(crop.right); + bottom = static_cast(crop.bottom); + } else { + // Otherwise, fall back to framebuffer dimensions. + left = 0; + top = 0; + right = static_cast(framebuffer.width); + bottom = static_cast(framebuffer.height); } - UNIMPLEMENTED_IF(framebuffer_crop_rect.left != 0); + // Apply transformation flags. + auto framebuffer_transform_flags = framebuffer.transform_flags; - f32 left_start{}; - if (framebuffer_crop_rect.Top() > 0) { - left_start = static_cast(framebuffer_crop_rect.Top()) / - static_cast(framebuffer_crop_rect.Bottom()); + if (True(framebuffer_transform_flags & Service::android::BufferTransformFlags::FlipH)) { + // Switch left and right. + std::swap(left, right); } - f32 scale_u = static_cast(framebuffer.width) / static_cast(screen_info.width); - f32 scale_v = static_cast(framebuffer.height) / static_cast(screen_info.height); - // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering - // (e.g. handheld mode) on a 1920x1080 framebuffer. - if (!fsr) { - if (framebuffer_crop_rect.GetWidth() > 0) { - scale_u = static_cast(framebuffer_crop_rect.GetWidth()) / - static_cast(screen_info.width); - } - if (framebuffer_crop_rect.GetHeight() > 0) { - scale_v = static_cast(framebuffer_crop_rect.GetHeight()) / - static_cast(screen_info.height); - } + if (True(framebuffer_transform_flags & Service::android::BufferTransformFlags::FlipV)) { + // Switch top and bottom. + std::swap(top, bottom); + } + + framebuffer_transform_flags &= ~Service::android::BufferTransformFlags::FlipH; + framebuffer_transform_flags &= ~Service::android::BufferTransformFlags::FlipV; + if (True(framebuffer_transform_flags)) { + UNIMPLEMENTED_MSG("Unsupported framebuffer_transform_flags={}", + static_cast(framebuffer_transform_flags)); } + return Common::Rectangle(left, top, right, bottom); +} + +void BlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, + const Layout::FramebufferLayout layout) const { + // Get the normalized crop rectangle. + const auto crop = NormalizeCrop(framebuffer.crop_rect, framebuffer); + + // Get the screen properties. + const f32 screen_width = static_cast(screen_info.width); + const f32 screen_height = static_cast(screen_info.height); + + // Apply the crop. + const f32 left = crop.left / screen_width; + const f32 top = crop.top / screen_height; + const f32 right = crop.right / screen_width; + const f32 bottom = crop.bottom / screen_height; + + // Map the coordinates to the screen. const auto& screen = layout.screen; const auto x = static_cast(screen.left); const auto y = static_cast(screen.top); const auto w = static_cast(screen.GetWidth()); const auto h = static_cast(screen.GetHeight()); - data.vertices[0] = ScreenRectVertex(x, y, texcoords.top * scale_u, left_start + left * scale_v); - data.vertices[1] = - ScreenRectVertex(x + w, y, texcoords.bottom * scale_u, left_start + left * scale_v); - data.vertices[2] = - ScreenRectVertex(x, y + h, texcoords.top * scale_u, left_start + right * scale_v); - data.vertices[3] = - ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, left_start + right * scale_v); + + data.vertices[0] = ScreenRectVertex(x, y, left, top); + data.vertices[1] = ScreenRectVertex(x + w, y, right, top); + data.vertices[2] = ScreenRectVertex(x, y + h, left, bottom); + data.vertices[3] = ScreenRectVertex(x + w, y + h, right, bottom); } void BlitScreen::CreateSMAA(VkExtent2D smaa_size) { -- cgit v1.2.3 From 6513a356f07e64aacd66e792cf88e4cedc62cbcb Mon Sep 17 00:00:00 2001 From: Liam Date: Sat, 28 Oct 2023 11:40:02 -0400 Subject: renderer_vulkan: fix FSR cropping --- src/video_core/renderer_vulkan/vk_blit_screen.cpp | 130 ++++++++++++---------- src/video_core/renderer_vulkan/vk_fsr.cpp | 24 ++-- src/video_core/renderer_vulkan/vk_fsr.h | 2 +- 3 files changed, 86 insertions(+), 70 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 459ab32c2..66483a900 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -137,6 +137,56 @@ BlitScreen::BlitScreen(Core::Memory::Memory& cpu_memory_, Core::Frontend::EmuWin BlitScreen::~BlitScreen() = default; +static Common::Rectangle NormalizeCrop(const Tegra::FramebufferConfig& framebuffer, + const ScreenInfo& screen_info) { + f32 left, top, right, bottom; + + if (!framebuffer.crop_rect.IsEmpty()) { + // If crop rectangle is not empty, apply properties from rectangle. + left = static_cast(framebuffer.crop_rect.left); + top = static_cast(framebuffer.crop_rect.top); + right = static_cast(framebuffer.crop_rect.right); + bottom = static_cast(framebuffer.crop_rect.bottom); + } else { + // Otherwise, fall back to framebuffer dimensions. + left = 0; + top = 0; + right = static_cast(framebuffer.width); + bottom = static_cast(framebuffer.height); + } + + // Apply transformation flags. + auto framebuffer_transform_flags = framebuffer.transform_flags; + + if (True(framebuffer_transform_flags & Service::android::BufferTransformFlags::FlipH)) { + // Switch left and right. + std::swap(left, right); + } + if (True(framebuffer_transform_flags & Service::android::BufferTransformFlags::FlipV)) { + // Switch top and bottom. + std::swap(top, bottom); + } + + framebuffer_transform_flags &= ~Service::android::BufferTransformFlags::FlipH; + framebuffer_transform_flags &= ~Service::android::BufferTransformFlags::FlipV; + if (True(framebuffer_transform_flags)) { + UNIMPLEMENTED_MSG("Unsupported framebuffer_transform_flags={}", + static_cast(framebuffer_transform_flags)); + } + + // Get the screen properties. + const f32 screen_width = static_cast(screen_info.width); + const f32 screen_height = static_cast(screen_info.height); + + // Normalize coordinate space. + left /= screen_width; + top /= screen_height; + right /= screen_width; + bottom /= screen_height; + + return Common::Rectangle(left, top, right, bottom); +} + void BlitScreen::Recreate() { present_manager.WaitPresent(); scheduler.Finish(); @@ -354,17 +404,10 @@ void BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, source_image_view = smaa->Draw(scheduler, image_index, source_image, source_image_view); } if (fsr) { - auto crop_rect = framebuffer.crop_rect; - if (crop_rect.GetWidth() == 0) { - crop_rect.right = framebuffer.width; - } - if (crop_rect.GetHeight() == 0) { - crop_rect.bottom = framebuffer.height; - } - crop_rect = crop_rect.Scale(Settings::values.resolution_info.up_factor); - VkExtent2D fsr_input_size{ - .width = Settings::values.resolution_info.ScaleUp(framebuffer.width), - .height = Settings::values.resolution_info.ScaleUp(framebuffer.height), + const auto crop_rect = NormalizeCrop(framebuffer, screen_info); + const VkExtent2D fsr_input_size{ + .width = Settings::values.resolution_info.ScaleUp(screen_info.width), + .height = Settings::values.resolution_info.ScaleUp(screen_info.height), }; VkImageView fsr_image_view = fsr->Draw(scheduler, image_index, source_image_view, fsr_input_size, crop_rect); @@ -1395,61 +1438,28 @@ void BlitScreen::SetUniformData(BufferData& data, const Layout::FramebufferLayou MakeOrthographicMatrix(static_cast(layout.width), static_cast(layout.height)); } -static Common::Rectangle NormalizeCrop(Common::Rectangle crop, - const Tegra::FramebufferConfig& framebuffer) { +void BlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, + const Layout::FramebufferLayout layout) const { f32 left, top, right, bottom; - if (!crop.IsEmpty()) { - // If crop rectangle is not empty, apply properties from rectangle. - left = static_cast(crop.left); - top = static_cast(crop.top); - right = static_cast(crop.right); - bottom = static_cast(crop.bottom); - } else { - // Otherwise, fall back to framebuffer dimensions. + if (fsr) { + // FSR has already applied the crop, so we just want to render the image + // it has produced. left = 0; top = 0; - right = static_cast(framebuffer.width); - bottom = static_cast(framebuffer.height); - } - - // Apply transformation flags. - auto framebuffer_transform_flags = framebuffer.transform_flags; - - if (True(framebuffer_transform_flags & Service::android::BufferTransformFlags::FlipH)) { - // Switch left and right. - std::swap(left, right); - } - if (True(framebuffer_transform_flags & Service::android::BufferTransformFlags::FlipV)) { - // Switch top and bottom. - std::swap(top, bottom); - } - - framebuffer_transform_flags &= ~Service::android::BufferTransformFlags::FlipH; - framebuffer_transform_flags &= ~Service::android::BufferTransformFlags::FlipV; - if (True(framebuffer_transform_flags)) { - UNIMPLEMENTED_MSG("Unsupported framebuffer_transform_flags={}", - static_cast(framebuffer_transform_flags)); + right = 1; + bottom = 1; + } else { + // Get the normalized crop rectangle. + const auto crop = NormalizeCrop(framebuffer, screen_info); + + // Apply the crop. + left = crop.left; + top = crop.top; + right = crop.right; + bottom = crop.bottom; } - return Common::Rectangle(left, top, right, bottom); -} - -void BlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, - const Layout::FramebufferLayout layout) const { - // Get the normalized crop rectangle. - const auto crop = NormalizeCrop(framebuffer.crop_rect, framebuffer); - - // Get the screen properties. - const f32 screen_width = static_cast(screen_info.width); - const f32 screen_height = static_cast(screen_info.height); - - // Apply the crop. - const f32 left = crop.left / screen_width; - const f32 top = crop.top / screen_height; - const f32 right = crop.right / screen_width; - const f32 bottom = crop.bottom / screen_height; - // Map the coordinates to the screen. const auto& screen = layout.screen; const auto x = static_cast(screen.left); diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp index ce8f3f3c2..f7a05fbc0 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.cpp +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -34,7 +34,7 @@ FSR::FSR(const Device& device_, MemoryAllocator& memory_allocator_, size_t image } VkImageView FSR::Draw(Scheduler& scheduler, size_t image_index, VkImageView image_view, - VkExtent2D input_image_extent, const Common::Rectangle& crop_rect) { + VkExtent2D input_image_extent, const Common::Rectangle& crop_rect) { UpdateDescriptorSet(image_index, image_view); @@ -61,15 +61,21 @@ VkImageView FSR::Draw(Scheduler& scheduler, size_t image_index, VkImageView imag cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *easu_pipeline); + const f32 input_image_width = static_cast(input_image_extent.width); + const f32 input_image_height = static_cast(input_image_extent.height); + const f32 output_image_width = static_cast(output_size.width); + const f32 output_image_height = static_cast(output_size.height); + const f32 viewport_width = (crop_rect.right - crop_rect.left) * input_image_width; + const f32 viewport_x = crop_rect.left * input_image_width; + const f32 viewport_height = (crop_rect.bottom - crop_rect.top) * input_image_height; + const f32 viewport_y = crop_rect.top * input_image_height; + std::array push_constants; - FsrEasuConOffset( - push_constants.data() + 0, push_constants.data() + 4, push_constants.data() + 8, - push_constants.data() + 12, - - static_cast(crop_rect.GetWidth()), static_cast(crop_rect.GetHeight()), - static_cast(input_image_extent.width), static_cast(input_image_extent.height), - static_cast(output_size.width), static_cast(output_size.height), - static_cast(crop_rect.left), static_cast(crop_rect.top)); + FsrEasuConOffset(push_constants.data() + 0, push_constants.data() + 4, + push_constants.data() + 8, push_constants.data() + 12, + + viewport_width, viewport_height, input_image_width, input_image_height, + output_image_width, output_image_height, viewport_x, viewport_y); cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, push_constants); { diff --git a/src/video_core/renderer_vulkan/vk_fsr.h b/src/video_core/renderer_vulkan/vk_fsr.h index 8bb9fc23a..3505c1416 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.h +++ b/src/video_core/renderer_vulkan/vk_fsr.h @@ -17,7 +17,7 @@ public: explicit FSR(const Device& device, MemoryAllocator& memory_allocator, size_t image_count, VkExtent2D output_size); VkImageView Draw(Scheduler& scheduler, size_t image_index, VkImageView image_view, - VkExtent2D input_image_extent, const Common::Rectangle& crop_rect); + VkExtent2D input_image_extent, const Common::Rectangle& crop_rect); private: void CreateDescriptorPool(); -- cgit v1.2.3 From 41701052d3ebbd2ed746beef342e1bdeaa9374e6 Mon Sep 17 00:00:00 2001 From: Liam Date: Wed, 1 Nov 2023 20:47:08 -0400 Subject: renderer_vulkan: minimize transform feedback support log --- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 059b7cb40..3983b2eb7 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -923,9 +923,13 @@ void RasterizerVulkan::UpdateDynamicStates() { } void RasterizerVulkan::HandleTransformFeedback() { + static std::once_flag warn_unsupported; + const auto& regs = maxwell3d->regs; if (!device.IsExtTransformFeedbackSupported()) { - LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); + std::call_once(warn_unsupported, [&] { + LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); + }); return; } query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount, -- cgit v1.2.3 From 75de0cadcfaa483393a832fc804d80382f61d885 Mon Sep 17 00:00:00 2001 From: Liam Date: Fri, 3 Nov 2023 20:54:38 -0400 Subject: renderer_null: fix --- src/video_core/renderer_null/null_rasterizer.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp index 65cd5aa06..4f1d5b548 100644 --- a/src/video_core/renderer_null/null_rasterizer.cpp +++ b/src/video_core/renderer_null/null_rasterizer.cpp @@ -3,6 +3,7 @@ #include "common/alignment.h" #include "core/memory.h" +#include "video_core/control/channel_state.h" #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" #include "video_core/renderer_null/null_rasterizer.h" @@ -99,8 +100,14 @@ bool RasterizerNull::AccelerateDisplay(const Tegra::FramebufferConfig& config, } void RasterizerNull::LoadDiskResources(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) {} -void RasterizerNull::InitializeChannel(Tegra::Control::ChannelState& channel) {} -void RasterizerNull::BindChannel(Tegra::Control::ChannelState& channel) {} -void RasterizerNull::ReleaseChannel(s32 channel_id) {} +void RasterizerNull::InitializeChannel(Tegra::Control::ChannelState& channel) { + CreateChannel(channel); +} +void RasterizerNull::BindChannel(Tegra::Control::ChannelState& channel) { + BindToChannel(channel.bind_id); +} +void RasterizerNull::ReleaseChannel(s32 channel_id) { + EraseChannel(channel_id); +} } // namespace Null -- cgit v1.2.3 From a423e0f9e0f7d759f22474e93a18cedeb8ab418c Mon Sep 17 00:00:00 2001 From: liamwhite Date: Sun, 5 Nov 2023 15:47:35 -0500 Subject: renderer_vulkan: render on bottom of surface clip when flipped (#11894) --- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 3983b2eb7..c0e8431e4 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -82,7 +82,7 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in } if (y_negate) { - y += height; + y += conv(static_cast(regs.surface_clip.height)); height = -height; } -- cgit v1.2.3 From 1d03a0fa7598cc8bafaf9edc8796eb0137ee7876 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 10 Nov 2023 15:40:48 +0100 Subject: Revert "renderer_vulkan: add locks to avoid scheduler flushes from CPU" This reverts commit d9dde7e6f3a90f58d642808900ddd558da21f762. --- src/video_core/fence_manager.h | 5 +---- src/video_core/renderer_vulkan/renderer_vulkan.cpp | 14 +++++--------- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 10 ++++------ src/video_core/renderer_vulkan/vk_rasterizer.h | 4 ---- 4 files changed, 10 insertions(+), 23 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index c0e6471fe..805a89900 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -86,10 +86,7 @@ public: uncommitted_operations.emplace_back(std::move(func)); } pending_operations.emplace_back(std::move(uncommitted_operations)); - { - std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; - QueueFence(new_fence); - } + QueueFence(new_fence); if (!delay_fence) { func(); } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 7e7a80740..c4c30d807 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -132,16 +132,12 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { const bool use_accelerated = rasterizer.AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); const bool is_srgb = use_accelerated && screen_info.is_srgb; + RenderScreenshot(*framebuffer, use_accelerated); - { - std::scoped_lock lock{rasterizer.LockCaches()}; - RenderScreenshot(*framebuffer, use_accelerated); - - Frame* frame = present_manager.GetRenderFrame(); - blit_screen.DrawToSwapchain(frame, *framebuffer, use_accelerated, is_srgb); - scheduler.Flush(*frame->render_ready); - present_manager.Present(frame); - } + Frame* frame = present_manager.GetRenderFrame(); + blit_screen.DrawToSwapchain(frame, *framebuffer, use_accelerated, is_srgb); + scheduler.Flush(*frame->render_ready); + present_manager.Present(frame); gpu.RendererFrameEndNotify(); rasterizer.TickFrame(); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index c0e8431e4..3bfaabc49 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -199,7 +199,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { if (!pipeline) { return; } - std::scoped_lock lock{LockCaches()}; + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; // update engine as channel may be different. pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); @@ -710,7 +710,6 @@ void RasterizerVulkan::TiledCacheBarrier() { } void RasterizerVulkan::FlushCommands() { - std::scoped_lock lock{LockCaches()}; if (draw_counter == 0) { return; } @@ -808,7 +807,6 @@ void RasterizerVulkan::FlushWork() { if ((++draw_counter & 7) != 7) { return; } - std::scoped_lock lock{LockCaches()}; if (draw_counter < DRAWS_TO_DISPATCH) { // Send recorded tasks to the worker thread scheduler.DispatchWork(); @@ -1507,7 +1505,7 @@ void RasterizerVulkan::UpdateVertexInput(Tegra::Engines::Maxwell3D::Regs& regs) void RasterizerVulkan::InitializeChannel(Tegra::Control::ChannelState& channel) { CreateChannel(channel); { - std::scoped_lock lock{LockCaches()}; + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; texture_cache.CreateChannel(channel); buffer_cache.CreateChannel(channel); } @@ -1520,7 +1518,7 @@ void RasterizerVulkan::BindChannel(Tegra::Control::ChannelState& channel) { const s32 channel_id = channel.bind_id; BindToChannel(channel_id); { - std::scoped_lock lock{LockCaches()}; + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; texture_cache.BindToChannel(channel_id); buffer_cache.BindToChannel(channel_id); } @@ -1533,7 +1531,7 @@ void RasterizerVulkan::BindChannel(Tegra::Control::ChannelState& channel) { void RasterizerVulkan::ReleaseChannel(s32 channel_id) { EraseChannel(channel_id); { - std::scoped_lock lock{LockCaches()}; + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; texture_cache.EraseChannel(channel_id); buffer_cache.EraseChannel(channel_id); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index ce3dfbaab..ad069556c 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -133,10 +133,6 @@ public: void ReleaseChannel(s32 channel_id) override; - std::scoped_lock LockCaches() { - return std::scoped_lock{buffer_cache.mutex, texture_cache.mutex}; - } - private: static constexpr size_t MAX_TEXTURES = 192; static constexpr size_t MAX_IMAGES = 48; -- cgit v1.2.3 From f1806d237f57f2c0944f6ae4721ac9497de5b4cf Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 10 Nov 2023 16:43:56 +0100 Subject: Memory: Fix invalidation handling from the CPU/Services --- src/video_core/renderer_opengl/gl_rasterizer.cpp | 2 +- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 27e2de1bf..9995b6dd4 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -555,7 +555,7 @@ void RasterizerOpenGL::OnCacheInvalidation(VAddr addr, u64 size) { } { std::scoped_lock lock{buffer_cache.mutex}; - buffer_cache.CachedWriteMemory(addr, size); + buffer_cache.WriteMemory(addr, size); } shader_cache.InvalidateRegion(addr, size); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 3bfaabc49..e0ab1eaac 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -621,7 +621,7 @@ void RasterizerVulkan::OnCacheInvalidation(VAddr addr, u64 size) { } { std::scoped_lock lock{buffer_cache.mutex}; - buffer_cache.CachedWriteMemory(addr, size); + buffer_cache.WriteMemory(addr, size); } pipeline_cache.InvalidateRegion(addr, size); } -- cgit v1.2.3 From efc50485b80e4fde656897b7c9c32f2dbb78977b Mon Sep 17 00:00:00 2001 From: GPUCode Date: Sat, 9 Sep 2023 17:28:06 +0300 Subject: renderer_vulkan: Introduce separate cmd buffer for uploads --- src/video_core/CMakeLists.txt | 1 + src/video_core/buffer_cache/buffer_cache.h | 97 ++++++++-------------- src/video_core/buffer_cache/buffer_cache_base.h | 4 - src/video_core/buffer_cache/usage_tracker.h | 79 ++++++++++++++++++ src/video_core/renderer_opengl/gl_buffer_cache.cpp | 7 +- src/video_core/renderer_opengl/gl_buffer_cache.h | 17 +++- src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 36 +++++++- src/video_core/renderer_vulkan/vk_buffer_cache.h | 21 ++++- .../renderer_vulkan/vk_master_semaphore.cpp | 29 ++++--- .../renderer_vulkan/vk_master_semaphore.h | 14 ++-- src/video_core/renderer_vulkan/vk_scheduler.cpp | 20 +++-- src/video_core/renderer_vulkan/vk_scheduler.h | 21 +++-- src/video_core/renderer_vulkan/vk_smaa.cpp | 4 +- .../renderer_vulkan/vk_staging_buffer_pool.h | 4 + src/video_core/texture_cache/slot_vector.h | 4 + src/video_core/vulkan_common/vulkan_wrapper.h | 4 + 16 files changed, 256 insertions(+), 106 deletions(-) create mode 100644 src/video_core/buffer_cache/usage_tracker.h (limited to 'src/video_core') diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index cf9266d54..336532e0b 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -15,6 +15,7 @@ add_library(video_core STATIC buffer_cache/buffer_cache.cpp buffer_cache/buffer_cache.h buffer_cache/memory_tracker_base.h + buffer_cache/usage_tracker.h buffer_cache/word_manager.h cache_types.h cdma_pusher.cpp diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 081a574e8..813b68963 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -67,6 +67,7 @@ void BufferCache

::TickFrame() { if (!channel_state) { return; } + runtime.TickFrame(slot_buffers); // Calculate hits and shots and move hit bits to the right const u32 hits = std::reduce(channel_state->uniform_cache_hits.begin(), @@ -230,7 +231,10 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am for (const IntervalType& add_interval : tmp_intervals) { common_ranges.add(add_interval); } - runtime.CopyBuffer(dest_buffer, src_buffer, copies); + const auto& copy = copies[0]; + src_buffer.MarkUsage(copy.src_offset, copy.size); + dest_buffer.MarkUsage(copy.dst_offset, copy.size); + runtime.CopyBuffer(dest_buffer, src_buffer, copies, true); if (has_new_downloads) { memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } @@ -258,9 +262,10 @@ bool BufferCache

::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) { common_ranges.subtract(subtract_interval); const BufferId buffer = FindBuffer(*cpu_dst_address, static_cast(size)); - auto& dest_buffer = slot_buffers[buffer]; + Buffer& dest_buffer = slot_buffers[buffer]; const u32 offset = dest_buffer.Offset(*cpu_dst_address); runtime.ClearBuffer(dest_buffer, offset, size, value); + dest_buffer.MarkUsage(offset, size); return true; } @@ -603,6 +608,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { VAddr orig_cpu_addr = static_cast(second_copy.src_offset); const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; async_downloads += std::make_pair(base_interval, 1); + buffer.MarkUsage(copy.src_offset, copy.size); runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); normalized_copies.push_back(second_copy); } @@ -621,8 +627,9 @@ void BufferCache

::CommitAsyncFlushesHigh() { // Have in mind the staging buffer offset for the copy copy.dst_offset += download_staging.offset; const std::array copies{copy}; - runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, - false); + Buffer& buffer = slot_buffers[buffer_id]; + buffer.MarkUsage(copy.src_offset, copy.size); + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); } runtime.PostCopyBarrier(); runtime.Finish(); @@ -742,7 +749,7 @@ void BufferCache

::BindHostIndexBuffer() { {BufferCopy{.src_offset = upload_staging.offset, .dst_offset = 0, .size = size}}}; std::memcpy(upload_staging.mapped_span.data(), draw_state.inline_index_draw_indexes.data(), size); - runtime.CopyBuffer(buffer, upload_staging.buffer, copies); + runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true); } else { buffer.ImmediateUpload(0, draw_state.inline_index_draw_indexes); } @@ -754,6 +761,7 @@ void BufferCache

::BindHostIndexBuffer() { offset + draw_state.index_buffer.first * draw_state.index_buffer.FormatSizeInBytes(); runtime.BindIndexBuffer(buffer, new_offset, size); } else { + buffer.MarkUsage(offset, size); runtime.BindIndexBuffer(draw_state.topology, draw_state.index_buffer.format, draw_state.index_buffer.first, draw_state.index_buffer.count, buffer, offset, size); @@ -790,6 +798,7 @@ void BufferCache

::BindHostVertexBuffers() { const u32 stride = maxwell3d->regs.vertex_streams[index].stride; const u32 offset = buffer.Offset(binding.cpu_addr); + buffer.MarkUsage(offset, binding.size); host_bindings.buffers.push_back(&buffer); host_bindings.offsets.push_back(offset); @@ -895,6 +904,7 @@ void BufferCache

::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { channel_state->uniform_buffer_binding_sizes[stage][binding_index] = size; } + buffer.MarkUsage(offset, size); if constexpr (NEEDS_BIND_UNIFORM_INDEX) { runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size); } else { @@ -913,6 +923,7 @@ void BufferCache

::BindHostGraphicsStorageBuffers(size_t stage) { SynchronizeBuffer(buffer, binding.cpu_addr, size); const u32 offset = buffer.Offset(binding.cpu_addr); + buffer.MarkUsage(offset, size); const bool is_written = ((channel_state->written_storage_buffers[stage] >> index) & 1) != 0; if (is_written) { @@ -943,6 +954,7 @@ void BufferCache

::BindHostGraphicsTextureBuffers(size_t stage) { const u32 offset = buffer.Offset(binding.cpu_addr); const PixelFormat format = binding.format; + buffer.MarkUsage(offset, size); if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) { if (((channel_state->image_texture_buffers[stage] >> index) & 1) != 0) { runtime.BindImageBuffer(buffer, offset, size, format); @@ -975,9 +987,10 @@ void BufferCache

::BindHostTransformFeedbackBuffers() { MarkWrittenBuffer(binding.buffer_id, binding.cpu_addr, size); const u32 offset = buffer.Offset(binding.cpu_addr); + buffer.MarkUsage(offset, size); host_bindings.buffers.push_back(&buffer); host_bindings.offsets.push_back(offset); - host_bindings.sizes.push_back(binding.size); + host_bindings.sizes.push_back(size); } if (host_bindings.buffers.size() > 0) { runtime.BindTransformFeedbackBuffers(host_bindings); @@ -1001,6 +1014,7 @@ void BufferCache

::BindHostComputeUniformBuffers() { SynchronizeBuffer(buffer, binding.cpu_addr, size); const u32 offset = buffer.Offset(binding.cpu_addr); + buffer.MarkUsage(offset, size); if constexpr (NEEDS_BIND_UNIFORM_INDEX) { runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size); ++binding_index; @@ -1021,6 +1035,7 @@ void BufferCache

::BindHostComputeStorageBuffers() { SynchronizeBuffer(buffer, binding.cpu_addr, size); const u32 offset = buffer.Offset(binding.cpu_addr); + buffer.MarkUsage(offset, size); const bool is_written = ((channel_state->written_compute_storage_buffers >> index) & 1) != 0; @@ -1053,6 +1068,7 @@ void BufferCache

::BindHostComputeTextureBuffers() { const u32 offset = buffer.Offset(binding.cpu_addr); const PixelFormat format = binding.format; + buffer.MarkUsage(offset, size); if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) { if (((channel_state->image_compute_texture_buffers >> index) & 1) != 0) { runtime.BindImageBuffer(buffer, offset, size, format); @@ -1172,10 +1188,11 @@ void BufferCache

::UpdateVertexBuffer(u32 index) { if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) { size = static_cast(gpu_memory->MaxContinuousRange(gpu_addr_begin, size)); } + const BufferId buffer_id = FindBuffer(*cpu_addr, size); channel_state->vertex_buffers[index] = Binding{ .cpu_addr = *cpu_addr, .size = size, - .buffer_id = FindBuffer(*cpu_addr, size), + .buffer_id = buffer_id, }; } @@ -1406,7 +1423,8 @@ void BufferCache

::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, .dst_offset = dst_base_offset, .size = overlap.SizeBytes(), }); - runtime.CopyBuffer(new_buffer, overlap, copies); + new_buffer.MarkUsage(copies[0].dst_offset, copies[0].size); + runtime.CopyBuffer(new_buffer, overlap, copies, true); DeleteBuffer(overlap_id, true); } @@ -1419,7 +1437,9 @@ BufferId BufferCache

::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { const u32 size = static_cast(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); auto& new_buffer = slot_buffers[new_buffer_id]; - runtime.ClearBuffer(new_buffer, 0, new_buffer.SizeBytes(), 0); + const size_t size_bytes = new_buffer.SizeBytes(); + runtime.ClearBuffer(new_buffer, 0, size_bytes, 0); + new_buffer.MarkUsage(0, size_bytes); for (const BufferId overlap_id : overlap.ids) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } @@ -1472,11 +1492,6 @@ void BufferCache

::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept { template bool BufferCache

::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { - return SynchronizeBufferImpl(buffer, cpu_addr, size); -} - -template -bool BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) { boost::container::small_vector copies; u64 total_size_bytes = 0; u64 largest_copy = 0; @@ -1498,51 +1513,6 @@ bool BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s return false; } -template -bool BufferCache

::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) { - boost::container::small_vector copies; - u64 total_size_bytes = 0; - u64 largest_copy = 0; - IntervalSet found_sets{}; - auto make_copies = [&] { - for (auto& interval : found_sets) { - const std::size_t sub_size = interval.upper() - interval.lower(); - const VAddr cpu_addr_ = interval.lower(); - copies.push_back(BufferCopy{ - .src_offset = total_size_bytes, - .dst_offset = cpu_addr_ - buffer.CpuAddr(), - .size = sub_size, - }); - total_size_bytes += sub_size; - largest_copy = std::max(largest_copy, sub_size); - } - const std::span copies_span(copies.data(), copies.size()); - UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); - }; - memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { - const VAddr base_adr = cpu_addr_out; - const VAddr end_adr = base_adr + range_size; - const IntervalType add_interval{base_adr, end_adr}; - found_sets.add(add_interval); - }); - if (found_sets.empty()) { - return true; - } - const IntervalType search_interval{cpu_addr, cpu_addr + size}; - auto it = common_ranges.lower_bound(search_interval); - auto it_end = common_ranges.upper_bound(search_interval); - if (it == common_ranges.end()) { - make_copies(); - return false; - } - while (it != it_end) { - found_sets.subtract(*it); - it++; - } - make_copies(); - return false; -} - template void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span copies) { @@ -1591,7 +1561,8 @@ void BufferCache

::MappedUploadMemory([[maybe_unused]] Buffer& buffer, // Apply the staging offset copy.src_offset += upload_staging.offset; } - runtime.CopyBuffer(buffer, upload_staging.buffer, copies); + const bool can_reorder = runtime.CanReorderUpload(buffer, copies); + runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true, can_reorder); } } @@ -1633,7 +1604,8 @@ void BufferCache

::InlineMemoryImplementation(VAddr dest_address, size_t copy_ }}; u8* const src_pointer = upload_staging.mapped_span.data(); std::memcpy(src_pointer, inlined_buffer.data(), copy_size); - runtime.CopyBuffer(buffer, upload_staging.buffer, copies); + const bool can_reorder = runtime.CanReorderUpload(buffer, copies); + runtime.CopyBuffer(buffer, upload_staging.buffer, copies, true, can_reorder); } else { buffer.ImmediateUpload(buffer.Offset(dest_address), inlined_buffer.first(copy_size)); } @@ -1686,8 +1658,9 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si for (BufferCopy& copy : copies) { // Modify copies to have the staging offset in mind copy.dst_offset += download_staging.offset; + buffer.MarkUsage(copy.src_offset, copy.size); } - runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); + runtime.CopyBuffer(download_staging.buffer, buffer, copies_span, true); runtime.Finish(); for (const BufferCopy& copy : copies) { const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index eed267361..d6d696d8c 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -529,10 +529,6 @@ private: bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); - bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); - - bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size); - void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span copies); diff --git a/src/video_core/buffer_cache/usage_tracker.h b/src/video_core/buffer_cache/usage_tracker.h new file mode 100644 index 000000000..ab05fe415 --- /dev/null +++ b/src/video_core/buffer_cache/usage_tracker.h @@ -0,0 +1,79 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include "common/alignment.h" +#include "common/common_types.h" + +namespace VideoCommon { + +class UsageTracker { + static constexpr size_t BYTES_PER_BIT_SHIFT = 6; + static constexpr size_t PAGE_SHIFT = 6 + BYTES_PER_BIT_SHIFT; + static constexpr size_t PAGE_BYTES = 1 << PAGE_SHIFT; + +public: + explicit UsageTracker(size_t size) { + const size_t num_pages = (size >> PAGE_SHIFT) + 1; + pages.resize(num_pages, 0ULL); + } + + void Reset() noexcept { + std::ranges::fill(pages, 0ULL); + } + + void Track(u64 offset, u64 size) noexcept { + const size_t page = offset >> PAGE_SHIFT; + const size_t page_end = (offset + size) >> PAGE_SHIFT; + TrackPage(page, offset, size); + if (page == page_end) { + return; + } + for (size_t i = page + 1; i < page_end; i++) { + pages[i] = ~u64{0}; + } + const size_t offset_end = offset + size; + const size_t offset_end_page_aligned = Common::AlignDown(offset_end, PAGE_BYTES); + TrackPage(page_end, offset_end_page_aligned, offset_end - offset_end_page_aligned); + } + + [[nodiscard]] bool IsUsed(u64 offset, u64 size) const noexcept { + const size_t page = offset >> PAGE_SHIFT; + const size_t page_end = (offset + size) >> PAGE_SHIFT; + if (IsPageUsed(page, offset, size)) { + return true; + } + for (size_t i = page + 1; i < page_end; i++) { + if (pages[i] != 0) { + return true; + } + } + const size_t offset_end = offset + size; + const size_t offset_end_page_aligned = Common::AlignDown(offset_end, PAGE_BYTES); + return IsPageUsed(page_end, offset_end_page_aligned, offset_end - offset_end_page_aligned); + } + +private: + void TrackPage(u64 page, u64 offset, u64 size) noexcept { + const size_t offset_in_page = offset % PAGE_BYTES; + const size_t first_bit = offset_in_page >> BYTES_PER_BIT_SHIFT; + const size_t num_bits = std::min(size, PAGE_BYTES) >> BYTES_PER_BIT_SHIFT; + const size_t mask = ~u64{0} >> (64 - num_bits); + pages[page] |= (~u64{0} & mask) << first_bit; + } + + bool IsPageUsed(u64 page, u64 offset, u64 size) const noexcept { + const size_t offset_in_page = offset % PAGE_BYTES; + const size_t first_bit = offset_in_page >> BYTES_PER_BIT_SHIFT; + const size_t num_bits = std::min(size, PAGE_BYTES) >> BYTES_PER_BIT_SHIFT; + const size_t mask = ~u64{0} >> (64 - num_bits); + const size_t mask2 = (~u64{0} & mask) << first_bit; + return (pages[page] & mask2) != 0; + } + +private: + std::vector pages; +}; + +} // namespace VideoCommon diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 38d553d3c..dfd696de6 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -178,13 +178,14 @@ void BufferCacheRuntime::CopyBuffer(GLuint dst_buffer, Buffer& src_buffer, } void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, GLuint src_buffer, - std::span copies, bool barrier) { + std::span copies, bool barrier, + bool) { CopyBuffer(dst_buffer.Handle(), src_buffer, copies, barrier); } void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, - std::span copies) { - CopyBuffer(dst_buffer.Handle(), src_buffer.Handle(), copies); + std::span copies, bool) { + CopyBuffer(dst_buffer.Handle(), src_buffer.Handle(), copies, true); } void BufferCacheRuntime::PreCopyBarrier() { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 41b746f3b..feccf06f9 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -30,6 +30,8 @@ public: void MakeResident(GLenum access) noexcept; + void MarkUsage(u64 offset, u64 size) {} + [[nodiscard]] GLuint View(u32 offset, u32 size, VideoCore::Surface::PixelFormat format); [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept { @@ -66,22 +68,29 @@ public: [[nodiscard]] StagingBufferMap DownloadStagingBuffer(size_t size); + bool CanReorderUpload(const Buffer&, std::span) { + return false; + } + void CopyBuffer(GLuint dst_buffer, GLuint src_buffer, - std::span copies, bool barrier = true); + std::span copies, bool barrier); void CopyBuffer(GLuint dst_buffer, Buffer& src_buffer, - std::span copies, bool barrier = true); + std::span copies, bool barrier); void CopyBuffer(Buffer& dst_buffer, GLuint src_buffer, - std::span copies, bool barrier = true); + std::span copies, bool barrier, + bool can_reorder_upload = false); void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, - std::span copies); + std::span copies, bool); void PreCopyBarrier(); void PostCopyBarrier(); void Finish(); + void TickFrame(VideoCommon::SlotVector&) noexcept {} + void ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value); void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index d8148e89a..7691cc2ba 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -79,13 +79,13 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo } // Anonymous namespace Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params) - : VideoCommon::BufferBase(null_params) {} + : VideoCommon::BufferBase(null_params), tracker{4096} {} Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_) : VideoCommon::BufferBase(rasterizer_, cpu_addr_, size_bytes_), - device{&runtime.device}, buffer{ - CreateBuffer(*device, runtime.memory_allocator, SizeBytes())} { + device{&runtime.device}, buffer{CreateBuffer(*device, runtime.memory_allocator, SizeBytes())}, + tracker{SizeBytes()} { if (runtime.device.HasDebuggingToolAttached()) { buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str()); } @@ -355,12 +355,31 @@ bool BufferCacheRuntime::CanReportMemoryUsage() const { return device.CanReportMemoryUsage(); } +void BufferCacheRuntime::TickFrame(VideoCommon::SlotVector& slot_buffers) noexcept { + for (auto it = slot_buffers.begin(); it != slot_buffers.end(); it++) { + it->ResetUsageTracking(); + } +} + void BufferCacheRuntime::Finish() { scheduler.Finish(); } +bool BufferCacheRuntime::CanReorderUpload(const Buffer& buffer, + std::span copies) { + if (Settings::values.disable_buffer_reorder) { + return false; + } + const bool can_use_upload_cmdbuf = + std::ranges::all_of(copies, [&](const VideoCommon::BufferCopy& copy) { + return !buffer.IsRegionUsed(copy.dst_offset, copy.size); + }); + return can_use_upload_cmdbuf; +} + void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, - std::span copies, bool barrier) { + std::span copies, bool barrier, + bool can_reorder_upload) { if (dst_buffer == VK_NULL_HANDLE || src_buffer == VK_NULL_HANDLE) { return; } @@ -376,9 +395,18 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, }; + // Measuring a popular game, this number never exceeds the specified size once data is warmed up boost::container::small_vector vk_copies(copies.size()); std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy); + if (src_buffer == staging_pool.StreamBuf() && can_reorder_upload) { + scheduler.RecordWithUploadBuffer([src_buffer, dst_buffer, vk_copies]( + vk::CommandBuffer, vk::CommandBuffer upload_cmdbuf) { + upload_cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies); + }); + return; + } + scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([src_buffer, dst_buffer, vk_copies, barrier](vk::CommandBuffer cmdbuf) { if (barrier) { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 95446c732..4416a902f 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -5,6 +5,7 @@ #include "video_core/buffer_cache/buffer_cache_base.h" #include "video_core/buffer_cache/memory_tracker_base.h" +#include "video_core/buffer_cache/usage_tracker.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -34,6 +35,18 @@ public: return *buffer; } + [[nodiscard]] bool IsRegionUsed(u64 offset, u64 size) const noexcept { + return tracker.IsUsed(offset, size); + } + + void MarkUsage(u64 offset, u64 size) noexcept { + tracker.Track(offset, size); + } + + void ResetUsageTracking() noexcept { + tracker.Reset(); + } + operator VkBuffer() const noexcept { return *buffer; } @@ -49,6 +62,7 @@ private: const Device* device{}; vk::Buffer buffer; std::vector views; + VideoCommon::UsageTracker tracker; }; class QuadArrayIndexBuffer; @@ -67,6 +81,8 @@ public: ComputePassDescriptorQueue& compute_pass_descriptor_queue, DescriptorPool& descriptor_pool); + void TickFrame(VideoCommon::SlotVector& slot_buffers) noexcept; + void Finish(); u64 GetDeviceLocalMemory() const; @@ -79,12 +95,15 @@ public: [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false); + bool CanReorderUpload(const Buffer& buffer, std::span copies); + void FreeDeferredStagingBuffer(StagingBufferRef& ref); void PreCopyBarrier(); void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, - std::span copies, bool barrier = true); + std::span copies, bool barrier, + bool can_reorder_upload = false); void PostCopyBarrier(); diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index 6b288b994..ac8b6e838 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -100,12 +100,14 @@ void MasterSemaphore::Wait(u64 tick) { Refresh(); } -VkResult MasterSemaphore::SubmitQueue(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore, - VkSemaphore wait_semaphore, u64 host_tick) { +VkResult MasterSemaphore::SubmitQueue(vk::CommandBuffer& cmdbuf, vk::CommandBuffer& upload_cmdbuf, + VkSemaphore signal_semaphore, VkSemaphore wait_semaphore, + u64 host_tick) { if (semaphore) { - return SubmitQueueTimeline(cmdbuf, signal_semaphore, wait_semaphore, host_tick); + return SubmitQueueTimeline(cmdbuf, upload_cmdbuf, signal_semaphore, wait_semaphore, + host_tick); } else { - return SubmitQueueFence(cmdbuf, signal_semaphore, wait_semaphore, host_tick); + return SubmitQueueFence(cmdbuf, upload_cmdbuf, signal_semaphore, wait_semaphore, host_tick); } } @@ -115,6 +117,7 @@ static constexpr std::array wait_stage_masks{ }; VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, + vk::CommandBuffer& upload_cmdbuf, VkSemaphore signal_semaphore, VkSemaphore wait_semaphore, u64 host_tick) { const VkSemaphore timeline_semaphore = *semaphore; @@ -123,6 +126,8 @@ VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, const std::array signal_values{host_tick, u64(0)}; const std::array signal_semaphores{timeline_semaphore, signal_semaphore}; + const std::array cmdbuffers{*upload_cmdbuf, *cmdbuf}; + const u32 num_wait_semaphores = wait_semaphore ? 1 : 0; const VkTimelineSemaphoreSubmitInfo timeline_si{ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, @@ -138,8 +143,8 @@ VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, .waitSemaphoreCount = num_wait_semaphores, .pWaitSemaphores = &wait_semaphore, .pWaitDstStageMask = wait_stage_masks.data(), - .commandBufferCount = 1, - .pCommandBuffers = cmdbuf.address(), + .commandBufferCount = static_cast(cmdbuffers.size()), + .pCommandBuffers = cmdbuffers.data(), .signalSemaphoreCount = num_signal_semaphores, .pSignalSemaphores = signal_semaphores.data(), }; @@ -147,19 +152,23 @@ VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, return device.GetGraphicsQueue().Submit(submit_info); } -VkResult MasterSemaphore::SubmitQueueFence(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore, - VkSemaphore wait_semaphore, u64 host_tick) { +VkResult MasterSemaphore::SubmitQueueFence(vk::CommandBuffer& cmdbuf, + vk::CommandBuffer& upload_cmdbuf, + VkSemaphore signal_semaphore, VkSemaphore wait_semaphore, + u64 host_tick) { const u32 num_signal_semaphores = signal_semaphore ? 1 : 0; const u32 num_wait_semaphores = wait_semaphore ? 1 : 0; + const std::array cmdbuffers{*upload_cmdbuf, *cmdbuf}; + const VkSubmitInfo submit_info{ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .pNext = nullptr, .waitSemaphoreCount = num_wait_semaphores, .pWaitSemaphores = &wait_semaphore, .pWaitDstStageMask = wait_stage_masks.data(), - .commandBufferCount = 1, - .pCommandBuffers = cmdbuf.address(), + .commandBufferCount = static_cast(cmdbuffers.size()), + .pCommandBuffers = cmdbuffers.data(), .signalSemaphoreCount = num_signal_semaphores, .pSignalSemaphores = &signal_semaphore, }; diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 3f599d7bd..7dfb93ffb 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -52,14 +52,16 @@ public: void Wait(u64 tick); /// Submits the device graphics queue, updating the tick as necessary - VkResult SubmitQueue(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore, - VkSemaphore wait_semaphore, u64 host_tick); + VkResult SubmitQueue(vk::CommandBuffer& cmdbuf, vk::CommandBuffer& upload_cmdbuf, + VkSemaphore signal_semaphore, VkSemaphore wait_semaphore, u64 host_tick); private: - VkResult SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore, - VkSemaphore wait_semaphore, u64 host_tick); - VkResult SubmitQueueFence(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore, - VkSemaphore wait_semaphore, u64 host_tick); + VkResult SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, vk::CommandBuffer& upload_cmdbuf, + VkSemaphore signal_semaphore, VkSemaphore wait_semaphore, + u64 host_tick); + VkResult SubmitQueueFence(vk::CommandBuffer& cmdbuf, vk::CommandBuffer& upload_cmdbuf, + VkSemaphore signal_semaphore, VkSemaphore wait_semaphore, + u64 host_tick); void WaitThread(std::stop_token token); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 3be7837f4..f1a9406ce 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -22,11 +22,12 @@ namespace Vulkan { MICROPROFILE_DECLARE(Vulkan_WaitForWorker); -void Scheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf) { +void Scheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf, + vk::CommandBuffer upload_cmdbuf) { auto command = first; while (command != nullptr) { auto next = command->GetNext(); - command->Execute(cmdbuf); + command->Execute(cmdbuf, upload_cmdbuf); command->~Command(); command = next; } @@ -180,7 +181,7 @@ void Scheduler::WorkerThread(std::stop_token stop_token) { // Perform the work, tracking whether the chunk was a submission // before executing. const bool has_submit = work->HasSubmit(); - work->ExecuteAll(current_cmdbuf); + work->ExecuteAll(current_cmdbuf, current_upload_cmdbuf); // If the chunk was a submission, reallocate the command buffer. if (has_submit) { @@ -205,6 +206,13 @@ void Scheduler::AllocateWorkerCommandBuffer() { .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, .pInheritanceInfo = nullptr, }); + current_upload_cmdbuf = vk::CommandBuffer(command_pool->Commit(), device.GetDispatchLoader()); + current_upload_cmdbuf.Begin({ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .pNext = nullptr, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + .pInheritanceInfo = nullptr, + }); } u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { @@ -212,7 +220,9 @@ u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_se InvalidateState(); const u64 signal_value = master_semaphore->NextTick(); - Record([signal_semaphore, wait_semaphore, signal_value, this](vk::CommandBuffer cmdbuf) { + RecordWithUploadBuffer([signal_semaphore, wait_semaphore, signal_value, + this](vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf) { + upload_cmdbuf.End(); cmdbuf.End(); if (on_submit) { @@ -221,7 +231,7 @@ u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_se std::scoped_lock lock{submit_mutex}; switch (const VkResult result = master_semaphore->SubmitQueue( - cmdbuf, signal_semaphore, wait_semaphore, signal_value)) { + cmdbuf, upload_cmdbuf, signal_semaphore, wait_semaphore, signal_value)) { case VK_SUCCESS: break; case VK_ERROR_DEVICE_LOST: diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index da03803aa..f8d8ca80a 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -80,7 +80,8 @@ public: /// Send work to a separate thread. template - void Record(T&& command) { + requires std::is_invocable_v + void RecordWithUploadBuffer(T&& command) { if (chunk->Record(command)) { return; } @@ -88,6 +89,15 @@ public: (void)chunk->Record(command); } + template + requires std::is_invocable_v + void Record(T&& c) { + this->RecordWithUploadBuffer( + [command = std::move(c)](vk::CommandBuffer cmdbuf, vk::CommandBuffer) { + command(cmdbuf); + }); + } + /// Returns the current command buffer tick. [[nodiscard]] u64 CurrentTick() const noexcept { return master_semaphore->CurrentTick(); @@ -119,7 +129,7 @@ private: public: virtual ~Command() = default; - virtual void Execute(vk::CommandBuffer cmdbuf) const = 0; + virtual void Execute(vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf) const = 0; Command* GetNext() const { return next; @@ -142,8 +152,8 @@ private: TypedCommand(TypedCommand&&) = delete; TypedCommand& operator=(TypedCommand&&) = delete; - void Execute(vk::CommandBuffer cmdbuf) const override { - command(cmdbuf); + void Execute(vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf) const override { + command(cmdbuf, upload_cmdbuf); } private: @@ -152,7 +162,7 @@ private: class CommandChunk final { public: - void ExecuteAll(vk::CommandBuffer cmdbuf); + void ExecuteAll(vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf); template bool Record(T& command) { @@ -228,6 +238,7 @@ private: VideoCommon::QueryCacheBase* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; + vk::CommandBuffer current_upload_cmdbuf; std::unique_ptr chunk; std::function on_submit; diff --git a/src/video_core/renderer_vulkan/vk_smaa.cpp b/src/video_core/renderer_vulkan/vk_smaa.cpp index 5efd7d66e..70644ea82 100644 --- a/src/video_core/renderer_vulkan/vk_smaa.cpp +++ b/src/video_core/renderer_vulkan/vk_smaa.cpp @@ -672,7 +672,7 @@ void SMAA::UploadImages(Scheduler& scheduler) { UploadImage(m_device, m_allocator, scheduler, m_static_images[Search], search_extent, VK_FORMAT_R8_UNORM, ARRAY_TO_SPAN(searchTexBytes)); - scheduler.Record([&](vk::CommandBuffer& cmdbuf) { + scheduler.Record([&](vk::CommandBuffer cmdbuf) { for (auto& images : m_dynamic_images) { for (size_t i = 0; i < MaxDynamicImage; i++) { ClearColorImage(cmdbuf, *images.images[i]); @@ -707,7 +707,7 @@ VkImageView SMAA::Draw(Scheduler& scheduler, size_t image_index, VkImage source_ UpdateDescriptorSets(source_image_view, image_index); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([=, this](vk::CommandBuffer& cmdbuf) { + scheduler.Record([=, this](vk::CommandBuffer cmdbuf) { TransitionImageLayout(cmdbuf, source_image, VK_IMAGE_LAYOUT_GENERAL); TransitionImageLayout(cmdbuf, edges_image, VK_IMAGE_LAYOUT_GENERAL); BeginRenderPass(cmdbuf, m_renderpasses[EdgeDetection], edge_detection_framebuffer, diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index d3deb9072..f63a20327 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -36,6 +36,10 @@ public: StagingBufferRef Request(size_t size, MemoryUsage usage, bool deferred = false); void FreeDeferred(StagingBufferRef& ref); + [[nodiscard]] VkBuffer StreamBuf() const noexcept { + return *stream_buffer; + } + void TickFrame(); private: diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h index 9df6a2903..3ffa2a661 100644 --- a/src/video_core/texture_cache/slot_vector.h +++ b/src/video_core/texture_cache/slot_vector.h @@ -138,6 +138,10 @@ public: return Iterator(this, SlotId{SlotId::INVALID_INDEX}); } + [[nodiscard]] size_t size() const noexcept { + return values_capacity - free_list.size(); + } + private: struct NonTrivialDummy { NonTrivialDummy() noexcept {} diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 0487cd3b6..a0c70797f 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -1101,6 +1101,10 @@ public: return &handle; } + VkCommandBuffer operator*() const noexcept { + return handle; + } + void Begin(const VkCommandBufferBeginInfo& begin_info) const { Check(dld->vkBeginCommandBuffer(handle, &begin_info)); } -- cgit v1.2.3 From 50bcfa5fb9e796c4ac6159be54e1363c0cee200a Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 12 Nov 2023 20:58:30 +0100 Subject: Vulkan: Add a final barrier to the upload command buffer --- src/video_core/renderer_vulkan/vk_scheduler.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index f1a9406ce..146923db4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -222,6 +222,14 @@ u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_se const u64 signal_value = master_semaphore->NextTick(); RecordWithUploadBuffer([signal_semaphore, wait_semaphore, signal_value, this](vk::CommandBuffer cmdbuf, vk::CommandBuffer upload_cmdbuf) { + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + upload_cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); upload_cmdbuf.End(); cmdbuf.End(); -- cgit v1.2.3 From c9437e524484f4110844c22ae00e20035cc1b3f2 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 15 Nov 2023 01:06:11 +0100 Subject: Query Cache: Disable write syncing on Android --- src/video_core/renderer_vulkan/vk_query_cache.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 66c03bf17..078777cdd 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -211,6 +211,13 @@ public: return; } PauseCounter(); + const auto driver_id = device.GetDriverID(); + if (driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY || + driver_id == VK_DRIVER_ID_ARM_PROPRIETARY || driver_id == VK_DRIVER_ID_MESA_TURNIP) { + pending_sync.clear(); + sync_values_stash.clear(); + return; + } sync_values_stash.clear(); sync_values_stash.emplace_back(); std::vector* sync_values = &sync_values_stash.back(); @@ -1378,6 +1385,12 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku return true; } + auto driver_id = impl->device.GetDriverID(); + if (driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY || + driver_id == VK_DRIVER_ID_ARM_PROPRIETARY || driver_id == VK_DRIVER_ID_MESA_TURNIP) { + return true; + } + for (size_t i = 0; i < 2; i++) { is_null[i] = !is_in_ac[i] && check_value(objects[i]->address); } -- cgit v1.2.3 From 4055a476aafb7b915c649363ccde7ba9b8d864d3 Mon Sep 17 00:00:00 2001 From: Liam Date: Wed, 15 Nov 2023 13:45:07 -0500 Subject: video_core: refactor video frame and packet parsing --- src/video_core/CMakeLists.txt | 4 +- src/video_core/host1x/codecs/codec.cpp | 329 ++----------------------- src/video_core/host1x/codecs/codec.h | 39 +-- src/video_core/host1x/codecs/h264.cpp | 4 +- src/video_core/host1x/codecs/h264.h | 1 + src/video_core/host1x/ffmpeg/ffmpeg.cpp | 419 ++++++++++++++++++++++++++++++++ src/video_core/host1x/ffmpeg/ffmpeg.h | 213 ++++++++++++++++ src/video_core/host1x/nvdec.cpp | 2 +- src/video_core/host1x/nvdec.h | 2 +- src/video_core/host1x/vic.cpp | 62 ++--- src/video_core/host1x/vic.h | 4 +- 11 files changed, 705 insertions(+), 374 deletions(-) create mode 100644 src/video_core/host1x/ffmpeg/ffmpeg.cpp create mode 100644 src/video_core/host1x/ffmpeg/ffmpeg.h (limited to 'src/video_core') diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index cf9266d54..b65b9f2a2 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -4,7 +4,7 @@ add_subdirectory(host_shaders) if(LIBVA_FOUND) - set_source_files_properties(host1x/codecs/codec.cpp + set_source_files_properties(host1x/ffmpeg/ffmpeg.cpp PROPERTIES COMPILE_DEFINITIONS LIBVA_FOUND=1) list(APPEND FFmpeg_LIBRARIES ${LIBVA_LIBRARIES}) endif() @@ -66,6 +66,8 @@ add_library(video_core STATIC host1x/codecs/vp9.cpp host1x/codecs/vp9.h host1x/codecs/vp9_types.h + host1x/ffmpeg/ffmpeg.cpp + host1x/ffmpeg/ffmpeg.h host1x/control.cpp host1x/control.h host1x/host1x.cpp diff --git a/src/video_core/host1x/codecs/codec.cpp b/src/video_core/host1x/codecs/codec.cpp index dbcf508e5..1030db681 100644 --- a/src/video_core/host1x/codecs/codec.cpp +++ b/src/video_core/host1x/codecs/codec.cpp @@ -1,11 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include -#include #include "common/assert.h" -#include "common/scope_exit.h" #include "common/settings.h" #include "video_core/host1x/codecs/codec.h" #include "video_core/host1x/codecs/h264.h" @@ -14,242 +10,17 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -extern "C" { -#include -#include -#include -#ifdef LIBVA_FOUND -// for querying VAAPI driver information -#include -#endif -} - namespace Tegra { -namespace { -constexpr AVPixelFormat PREFERRED_GPU_FMT = AV_PIX_FMT_NV12; -constexpr AVPixelFormat PREFERRED_CPU_FMT = AV_PIX_FMT_YUV420P; -constexpr std::array PREFERRED_GPU_DECODERS = { - AV_HWDEVICE_TYPE_CUDA, -#ifdef _WIN32 - AV_HWDEVICE_TYPE_D3D11VA, - AV_HWDEVICE_TYPE_DXVA2, -#elif defined(__unix__) - AV_HWDEVICE_TYPE_VAAPI, - AV_HWDEVICE_TYPE_VDPAU, -#endif - // last resort for Linux Flatpak (w/ NVIDIA) - AV_HWDEVICE_TYPE_VULKAN, -}; - -void AVPacketDeleter(AVPacket* ptr) { - av_packet_free(&ptr); -} - -using AVPacketPtr = std::unique_ptr; - -AVPixelFormat GetGpuFormat(AVCodecContext* av_codec_ctx, const AVPixelFormat* pix_fmts) { - for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) { - if (*p == av_codec_ctx->pix_fmt) { - return av_codec_ctx->pix_fmt; - } - } - LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU"); - av_buffer_unref(&av_codec_ctx->hw_device_ctx); - av_codec_ctx->pix_fmt = PREFERRED_CPU_FMT; - return PREFERRED_CPU_FMT; -} - -// List all the currently available hwcontext in ffmpeg -std::vector ListSupportedContexts() { - std::vector contexts{}; - AVHWDeviceType current_device_type = AV_HWDEVICE_TYPE_NONE; - do { - current_device_type = av_hwdevice_iterate_types(current_device_type); - contexts.push_back(current_device_type); - } while (current_device_type != AV_HWDEVICE_TYPE_NONE); - return contexts; -} - -} // namespace - -void AVFrameDeleter(AVFrame* ptr) { - av_frame_free(&ptr); -} Codec::Codec(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs) : host1x(host1x_), state{regs}, h264_decoder(std::make_unique(host1x)), vp8_decoder(std::make_unique(host1x)), vp9_decoder(std::make_unique(host1x)) {} -Codec::~Codec() { - if (!initialized) { - return; - } - // Free libav memory - avcodec_free_context(&av_codec_ctx); - av_buffer_unref(&av_gpu_decoder); - - if (filters_initialized) { - avfilter_graph_free(&av_filter_graph); - } -} - -bool Codec::CreateGpuAvDevice() { - static constexpr auto HW_CONFIG_METHOD = AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX; - static const auto supported_contexts = ListSupportedContexts(); - for (const auto& type : PREFERRED_GPU_DECODERS) { - if (std::none_of(supported_contexts.begin(), supported_contexts.end(), - [&type](const auto& context) { return context == type; })) { - LOG_DEBUG(Service_NVDRV, "{} explicitly unsupported", av_hwdevice_get_type_name(type)); - continue; - } - // Avoid memory leak from not cleaning up after av_hwdevice_ctx_create - av_buffer_unref(&av_gpu_decoder); - const int hwdevice_res = av_hwdevice_ctx_create(&av_gpu_decoder, type, nullptr, nullptr, 0); - if (hwdevice_res < 0) { - LOG_DEBUG(Service_NVDRV, "{} av_hwdevice_ctx_create failed {}", - av_hwdevice_get_type_name(type), hwdevice_res); - continue; - } -#ifdef LIBVA_FOUND - if (type == AV_HWDEVICE_TYPE_VAAPI) { - // we need to determine if this is an impersonated VAAPI driver - AVHWDeviceContext* hwctx = - static_cast(static_cast(av_gpu_decoder->data)); - AVVAAPIDeviceContext* vactx = static_cast(hwctx->hwctx); - const char* vendor_name = vaQueryVendorString(vactx->display); - if (strstr(vendor_name, "VDPAU backend")) { - // VDPAU impersonated VAAPI impl's are super buggy, we need to skip them - LOG_DEBUG(Service_NVDRV, "Skipping vdapu impersonated VAAPI driver"); - continue; - } else { - // according to some user testing, certain vaapi driver (Intel?) could be buggy - // so let's log the driver name which may help the developers/supporters - LOG_DEBUG(Service_NVDRV, "Using VAAPI driver: {}", vendor_name); - } - } -#endif - for (int i = 0;; i++) { - const AVCodecHWConfig* config = avcodec_get_hw_config(av_codec, i); - if (!config) { - LOG_DEBUG(Service_NVDRV, "{} decoder does not support device type {}.", - av_codec->name, av_hwdevice_get_type_name(type)); - break; - } - if ((config->methods & HW_CONFIG_METHOD) != 0 && config->device_type == type) { - LOG_INFO(Service_NVDRV, "Using {} GPU decoder", av_hwdevice_get_type_name(type)); - av_codec_ctx->pix_fmt = config->pix_fmt; - return true; - } - } - } - return false; -} - -void Codec::InitializeAvCodecContext() { - av_codec_ctx = avcodec_alloc_context3(av_codec); - av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); - av_codec_ctx->thread_count = 0; - av_codec_ctx->thread_type &= ~FF_THREAD_FRAME; -} - -void Codec::InitializeGpuDecoder() { - if (!CreateGpuAvDevice()) { - av_buffer_unref(&av_gpu_decoder); - return; - } - auto* hw_device_ctx = av_buffer_ref(av_gpu_decoder); - ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed"); - av_codec_ctx->hw_device_ctx = hw_device_ctx; - av_codec_ctx->get_format = GetGpuFormat; -} - -void Codec::InitializeAvFilters(AVFrame* frame) { - const AVFilter* buffer_src = avfilter_get_by_name("buffer"); - const AVFilter* buffer_sink = avfilter_get_by_name("buffersink"); - AVFilterInOut* inputs = avfilter_inout_alloc(); - AVFilterInOut* outputs = avfilter_inout_alloc(); - SCOPE_EXIT({ - avfilter_inout_free(&inputs); - avfilter_inout_free(&outputs); - }); - - // Don't know how to get the accurate time_base but it doesn't matter for yadif filter - // so just use 1/1 to make buffer filter happy - std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame->width, - frame->height, frame->format); - - av_filter_graph = avfilter_graph_alloc(); - int ret = avfilter_graph_create_filter(&av_filter_src_ctx, buffer_src, "in", args.c_str(), - nullptr, av_filter_graph); - if (ret < 0) { - LOG_ERROR(Service_NVDRV, "avfilter_graph_create_filter source error: {}", ret); - return; - } - - ret = avfilter_graph_create_filter(&av_filter_sink_ctx, buffer_sink, "out", nullptr, nullptr, - av_filter_graph); - if (ret < 0) { - LOG_ERROR(Service_NVDRV, "avfilter_graph_create_filter sink error: {}", ret); - return; - } - - inputs->name = av_strdup("out"); - inputs->filter_ctx = av_filter_sink_ctx; - inputs->pad_idx = 0; - inputs->next = nullptr; - - outputs->name = av_strdup("in"); - outputs->filter_ctx = av_filter_src_ctx; - outputs->pad_idx = 0; - outputs->next = nullptr; - - const char* description = "yadif=1:-1:0"; - ret = avfilter_graph_parse_ptr(av_filter_graph, description, &inputs, &outputs, nullptr); - if (ret < 0) { - LOG_ERROR(Service_NVDRV, "avfilter_graph_parse_ptr error: {}", ret); - return; - } - - ret = avfilter_graph_config(av_filter_graph, nullptr); - if (ret < 0) { - LOG_ERROR(Service_NVDRV, "avfilter_graph_config error: {}", ret); - return; - } - - filters_initialized = true; -} +Codec::~Codec() = default; void Codec::Initialize() { - const AVCodecID codec = [&] { - switch (current_codec) { - case Host1x::NvdecCommon::VideoCodec::H264: - return AV_CODEC_ID_H264; - case Host1x::NvdecCommon::VideoCodec::VP8: - return AV_CODEC_ID_VP8; - case Host1x::NvdecCommon::VideoCodec::VP9: - return AV_CODEC_ID_VP9; - default: - UNIMPLEMENTED_MSG("Unknown codec {}", current_codec); - return AV_CODEC_ID_NONE; - } - }(); - av_codec = avcodec_find_decoder(codec); - - InitializeAvCodecContext(); - if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Gpu) { - InitializeGpuDecoder(); - } - if (const int res = avcodec_open2(av_codec_ctx, av_codec, nullptr); res < 0) { - LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed with result {}", res); - avcodec_free_context(&av_codec_ctx); - av_buffer_unref(&av_gpu_decoder); - return; - } - if (!av_codec_ctx->hw_device_ctx) { - LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding"); - } - initialized = true; + initialized = decode_api.Initialize(current_codec); } void Codec::SetTargetCodec(Host1x::NvdecCommon::VideoCodec codec) { @@ -264,14 +35,18 @@ void Codec::Decode() { if (is_first_frame) { Initialize(); } + if (!initialized) { return; } + + // Assemble bitstream. bool vp9_hidden_frame = false; - const auto& frame_data = [&]() { + size_t configuration_size = 0; + const auto packet_data = [&]() { switch (current_codec) { case Tegra::Host1x::NvdecCommon::VideoCodec::H264: - return h264_decoder->ComposeFrame(state, is_first_frame); + return h264_decoder->ComposeFrame(state, &configuration_size, is_first_frame); case Tegra::Host1x::NvdecCommon::VideoCodec::VP8: return vp8_decoder->ComposeFrame(state); case Tegra::Host1x::NvdecCommon::VideoCodec::VP9: @@ -283,89 +58,35 @@ void Codec::Decode() { return std::span{}; } }(); - AVPacketPtr packet{av_packet_alloc(), AVPacketDeleter}; - if (!packet) { - LOG_ERROR(Service_NVDRV, "av_packet_alloc failed"); - return; - } - packet->data = const_cast(frame_data.data()); - packet->size = static_cast(frame_data.size()); - if (const int res = avcodec_send_packet(av_codec_ctx, packet.get()); res != 0) { - LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", res); + + // Send assembled bitstream to decoder. + if (!decode_api.SendPacket(packet_data, configuration_size)) { return; } - // Only receive/store visible frames + + // Only receive/store visible frames. if (vp9_hidden_frame) { return; } - AVFramePtr initial_frame{av_frame_alloc(), AVFrameDeleter}; - AVFramePtr final_frame{nullptr, AVFrameDeleter}; - ASSERT_MSG(initial_frame, "av_frame_alloc initial_frame failed"); - if (const int ret = avcodec_receive_frame(av_codec_ctx, initial_frame.get()); ret) { - LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret); - return; - } - if (initial_frame->width == 0 || initial_frame->height == 0) { - LOG_WARNING(Service_NVDRV, "Zero width or height in frame"); - return; - } - bool is_interlaced = initial_frame->interlaced_frame != 0; - if (av_codec_ctx->hw_device_ctx) { - final_frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; - ASSERT_MSG(final_frame, "av_frame_alloc final_frame failed"); - // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp - // because Intel drivers crash unless using AV_PIX_FMT_NV12 - final_frame->format = PREFERRED_GPU_FMT; - const int ret = av_hwframe_transfer_data(final_frame.get(), initial_frame.get(), 0); - ASSERT_MSG(!ret, "av_hwframe_transfer_data error {}", ret); - } else { - final_frame = std::move(initial_frame); - } - if (final_frame->format != PREFERRED_CPU_FMT && final_frame->format != PREFERRED_GPU_FMT) { - UNIMPLEMENTED_MSG("Unexpected video format: {}", final_frame->format); - return; - } - if (!is_interlaced) { - av_frames.push(std::move(final_frame)); - } else { - if (!filters_initialized) { - InitializeAvFilters(final_frame.get()); - } - if (const int ret = av_buffersrc_add_frame_flags(av_filter_src_ctx, final_frame.get(), - AV_BUFFERSRC_FLAG_KEEP_REF); - ret) { - LOG_DEBUG(Service_NVDRV, "av_buffersrc_add_frame_flags error {}", ret); - return; - } - while (true) { - auto filter_frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; - int ret = av_buffersink_get_frame(av_filter_sink_ctx, filter_frame.get()); + // Receive output frames from decoder. + decode_api.ReceiveFrames(frames); - if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF)) - break; - if (ret < 0) { - LOG_DEBUG(Service_NVDRV, "av_buffersink_get_frame error {}", ret); - return; - } - - av_frames.push(std::move(filter_frame)); - } - } - while (av_frames.size() > 10) { - LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame"); - av_frames.pop(); + while (frames.size() > 10) { + LOG_DEBUG(HW_GPU, "ReceiveFrames overflow, dropped frame"); + frames.pop(); } } -AVFramePtr Codec::GetCurrentFrame() { +std::unique_ptr Codec::GetCurrentFrame() { // Sometimes VIC will request more frames than have been decoded. - // in this case, return a nullptr and don't overwrite previous frame data - if (av_frames.empty()) { - return AVFramePtr{nullptr, AVFrameDeleter}; + // in this case, return a blank frame and don't overwrite previous data. + if (frames.empty()) { + return {}; } - AVFramePtr frame = std::move(av_frames.front()); - av_frames.pop(); + + auto frame = std::move(frames.front()); + frames.pop(); return frame; } diff --git a/src/video_core/host1x/codecs/codec.h b/src/video_core/host1x/codecs/codec.h index 06fe00a4b..f700ae129 100644 --- a/src/video_core/host1x/codecs/codec.h +++ b/src/video_core/host1x/codecs/codec.h @@ -4,28 +4,15 @@ #pragma once #include +#include #include #include #include "common/common_types.h" +#include "video_core/host1x/ffmpeg/ffmpeg.h" #include "video_core/host1x/nvdec_common.h" -extern "C" { -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" -#endif -#include -#include -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#endif -} - namespace Tegra { -void AVFrameDeleter(AVFrame* ptr); -using AVFramePtr = std::unique_ptr; - namespace Decoder { class H264; class VP8; @@ -51,7 +38,7 @@ public: void Decode(); /// Returns next decoded frame - [[nodiscard]] AVFramePtr GetCurrentFrame(); + [[nodiscard]] std::unique_ptr GetCurrentFrame(); /// Returns the value of current_codec [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const; @@ -60,25 +47,9 @@ public: [[nodiscard]] std::string_view GetCurrentCodecName() const; private: - void InitializeAvCodecContext(); - - void InitializeAvFilters(AVFrame* frame); - - void InitializeGpuDecoder(); - - bool CreateGpuAvDevice(); - bool initialized{}; - bool filters_initialized{}; Host1x::NvdecCommon::VideoCodec current_codec{Host1x::NvdecCommon::VideoCodec::None}; - - const AVCodec* av_codec{nullptr}; - AVCodecContext* av_codec_ctx{nullptr}; - AVBufferRef* av_gpu_decoder{nullptr}; - - AVFilterContext* av_filter_src_ctx{nullptr}; - AVFilterContext* av_filter_sink_ctx{nullptr}; - AVFilterGraph* av_filter_graph{nullptr}; + FFmpeg::DecodeApi decode_api; Host1x::Host1x& host1x; const Host1x::NvdecCommon::NvdecRegisters& state; @@ -86,7 +57,7 @@ private: std::unique_ptr vp8_decoder; std::unique_ptr vp9_decoder; - std::queue av_frames{}; + std::queue> frames{}; }; } // namespace Tegra diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index ece79b1e2..309a7f1d5 100644 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -30,7 +30,7 @@ H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {} H264::~H264() = default; std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, - bool is_first_frame) { + size_t* out_configuration_size, bool is_first_frame) { H264DecoderContext context; host1x.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); @@ -39,6 +39,7 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters if (!is_first_frame && frame_number != 0) { frame.resize_destructive(context.stream_len); host1x.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); + *out_configuration_size = 0; return frame; } @@ -157,6 +158,7 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters frame.resize(encoded_header.size() + context.stream_len); std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + *out_configuration_size = encoded_header.size(); host1x.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data() + encoded_header.size(), context.stream_len); diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h index d6b556322..1deaf4632 100644 --- a/src/video_core/host1x/codecs/h264.h +++ b/src/video_core/host1x/codecs/h264.h @@ -67,6 +67,7 @@ public: /// Compose the H264 frame for FFmpeg decoding [[nodiscard]] std::span ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, + size_t* out_configuration_size, bool is_first_frame = false); private: diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.cpp b/src/video_core/host1x/ffmpeg/ffmpeg.cpp new file mode 100644 index 000000000..dcd07e6d2 --- /dev/null +++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp @@ -0,0 +1,419 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "video_core/host1x/ffmpeg/ffmpeg.h" + +extern "C" { +#ifdef LIBVA_FOUND +// for querying VAAPI driver information +#include +#endif +} + +namespace FFmpeg { + +namespace { + +constexpr AVPixelFormat PreferredGpuFormat = AV_PIX_FMT_NV12; +constexpr AVPixelFormat PreferredCpuFormat = AV_PIX_FMT_YUV420P; +constexpr std::array PreferredGpuDecoders = { + AV_HWDEVICE_TYPE_CUDA, +#ifdef _WIN32 + AV_HWDEVICE_TYPE_D3D11VA, + AV_HWDEVICE_TYPE_DXVA2, +#elif defined(__unix__) + AV_HWDEVICE_TYPE_VAAPI, + AV_HWDEVICE_TYPE_VDPAU, +#endif + // last resort for Linux Flatpak (w/ NVIDIA) + AV_HWDEVICE_TYPE_VULKAN, +}; + +AVPixelFormat GetGpuFormat(AVCodecContext* codec_context, const AVPixelFormat* pix_fmts) { + for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) { + if (*p == codec_context->pix_fmt) { + return codec_context->pix_fmt; + } + } + + LOG_INFO(HW_GPU, "Could not find compatible GPU AV format, falling back to CPU"); + av_buffer_unref(&codec_context->hw_device_ctx); + + codec_context->pix_fmt = PreferredCpuFormat; + return codec_context->pix_fmt; +} + +std::string AVError(int errnum) { + char errbuf[AV_ERROR_MAX_STRING_SIZE] = {}; + av_make_error_string(errbuf, sizeof(errbuf) - 1, errnum); + return errbuf; +} + +} // namespace + +Packet::Packet(std::span data) { + m_packet = av_packet_alloc(); + m_packet->data = const_cast(data.data()); + m_packet->size = static_cast(data.size()); +} + +Packet::~Packet() { + av_packet_free(&m_packet); +} + +Frame::Frame() { + m_frame = av_frame_alloc(); +} + +Frame::~Frame() { + av_frame_free(&m_frame); +} + +Decoder::Decoder(Tegra::Host1x::NvdecCommon::VideoCodec codec) { + const AVCodecID av_codec = [&] { + switch (codec) { + case Tegra::Host1x::NvdecCommon::VideoCodec::H264: + return AV_CODEC_ID_H264; + case Tegra::Host1x::NvdecCommon::VideoCodec::VP8: + return AV_CODEC_ID_VP8; + case Tegra::Host1x::NvdecCommon::VideoCodec::VP9: + return AV_CODEC_ID_VP9; + default: + UNIMPLEMENTED_MSG("Unknown codec {}", codec); + return AV_CODEC_ID_NONE; + } + }(); + + m_codec = avcodec_find_decoder(av_codec); +} + +bool Decoder::SupportsDecodingOnDevice(AVPixelFormat* out_pix_fmt, AVHWDeviceType type) const { + for (int i = 0;; i++) { + const AVCodecHWConfig* config = avcodec_get_hw_config(m_codec, i); + if (!config) { + LOG_DEBUG(HW_GPU, "{} decoder does not support device type {}", m_codec->name, + av_hwdevice_get_type_name(type)); + break; + } + if ((config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) != 0 && + config->device_type == type) { + LOG_INFO(HW_GPU, "Using {} GPU decoder", av_hwdevice_get_type_name(type)); + *out_pix_fmt = config->pix_fmt; + return true; + } + } + + return false; +} + +std::vector HardwareContext::GetSupportedDeviceTypes() { + std::vector types; + AVHWDeviceType current_device_type = AV_HWDEVICE_TYPE_NONE; + + while (true) { + current_device_type = av_hwdevice_iterate_types(current_device_type); + if (current_device_type == AV_HWDEVICE_TYPE_NONE) { + return types; + } + + types.push_back(current_device_type); + } +} + +HardwareContext::~HardwareContext() { + av_buffer_unref(&m_gpu_decoder); +} + +bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context, + const Decoder& decoder) { + const auto supported_types = GetSupportedDeviceTypes(); + for (const auto type : PreferredGpuDecoders) { + AVPixelFormat hw_pix_fmt; + + if (std::ranges::find(supported_types, type) == supported_types.end()) { + LOG_DEBUG(HW_GPU, "{} explicitly unsupported", av_hwdevice_get_type_name(type)); + continue; + } + + if (!this->InitializeWithType(type)) { + continue; + } + + if (decoder.SupportsDecodingOnDevice(&hw_pix_fmt, type)) { + decoder_context.InitializeHardwareDecoder(*this, hw_pix_fmt); + return true; + } + } + + return false; +} + +bool HardwareContext::InitializeWithType(AVHWDeviceType type) { + av_buffer_unref(&m_gpu_decoder); + + if (const int ret = av_hwdevice_ctx_create(&m_gpu_decoder, type, nullptr, nullptr, 0); + ret < 0) { + LOG_DEBUG(HW_GPU, "av_hwdevice_ctx_create({}) failed: {}", av_hwdevice_get_type_name(type), + AVError(ret)); + return false; + } + +#ifdef LIBVA_FOUND + if (type == AV_HWDEVICE_TYPE_VAAPI) { + // We need to determine if this is an impersonated VAAPI driver. + auto* hwctx = reinterpret_cast(m_gpu_decoder->data); + auto* vactx = static_cast(hwctx->hwctx); + const char* vendor_name = vaQueryVendorString(vactx->display); + if (strstr(vendor_name, "VDPAU backend")) { + // VDPAU impersonated VAAPI impls are super buggy, we need to skip them. + LOG_DEBUG(HW_GPU, "Skipping VDPAU impersonated VAAPI driver"); + return false; + } else { + // According to some user testing, certain VAAPI drivers (Intel?) could be buggy. + // Log the driver name just in case. + LOG_DEBUG(HW_GPU, "Using VAAPI driver: {}", vendor_name); + } + } +#endif + + return true; +} + +DecoderContext::DecoderContext(const Decoder& decoder) { + m_codec_context = avcodec_alloc_context3(decoder.GetCodec()); + av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0); + m_codec_context->thread_count = 0; + m_codec_context->thread_type &= ~FF_THREAD_FRAME; +} + +DecoderContext::~DecoderContext() { + av_buffer_unref(&m_codec_context->hw_device_ctx); + avcodec_free_context(&m_codec_context); +} + +void DecoderContext::InitializeHardwareDecoder(const HardwareContext& context, + AVPixelFormat hw_pix_fmt) { + m_codec_context->hw_device_ctx = av_buffer_ref(context.GetBufferRef()); + m_codec_context->get_format = GetGpuFormat; + m_codec_context->pix_fmt = hw_pix_fmt; +} + +bool DecoderContext::OpenContext(const Decoder& decoder) { + if (const int ret = avcodec_open2(m_codec_context, decoder.GetCodec(), nullptr); ret < 0) { + LOG_ERROR(HW_GPU, "avcodec_open2 error: {}", AVError(ret)); + return false; + } + + if (!m_codec_context->hw_device_ctx) { + LOG_INFO(HW_GPU, "Using FFmpeg software decoding"); + } + + return true; +} + +bool DecoderContext::SendPacket(const Packet& packet) { + if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) { + LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret)); + return false; + } + + return true; +} + +std::unique_ptr DecoderContext::ReceiveFrame(bool* out_is_interlaced) { + auto dst_frame = std::make_unique(); + + const auto ReceiveImpl = [&](AVFrame* frame) { + if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { + LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); + return false; + } + + *out_is_interlaced = frame->interlaced_frame != 0; + return true; + }; + + if (m_codec_context->hw_device_ctx) { + // If we have a hardware context, make a separate frame here to receive the + // hardware result before sending it to the output. + Frame intermediate_frame; + + if (!ReceiveImpl(intermediate_frame.GetFrame())) { + return {}; + } + + dst_frame->SetFormat(PreferredGpuFormat); + if (const int ret = + av_hwframe_transfer_data(dst_frame->GetFrame(), intermediate_frame.GetFrame(), 0); + ret < 0) { + LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); + return {}; + } + } else { + // Otherwise, decode the frame as normal. + if (!ReceiveImpl(dst_frame->GetFrame())) { + return {}; + } + } + + return dst_frame; +} + +DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) { + const AVFilter* buffer_src = avfilter_get_by_name("buffer"); + const AVFilter* buffer_sink = avfilter_get_by_name("buffersink"); + AVFilterInOut* inputs = avfilter_inout_alloc(); + AVFilterInOut* outputs = avfilter_inout_alloc(); + SCOPE_EXIT({ + avfilter_inout_free(&inputs); + avfilter_inout_free(&outputs); + }); + + // Don't know how to get the accurate time_base but it doesn't matter for yadif filter + // so just use 1/1 to make buffer filter happy + std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(), + frame.GetHeight(), static_cast(frame.GetPixelFormat())); + + m_filter_graph = avfilter_graph_alloc(); + int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(), + nullptr, m_filter_graph); + if (ret < 0) { + LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret)); + return; + } + + ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr, + m_filter_graph); + if (ret < 0) { + LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret)); + return; + } + + inputs->name = av_strdup("out"); + inputs->filter_ctx = m_sink_context; + inputs->pad_idx = 0; + inputs->next = nullptr; + + outputs->name = av_strdup("in"); + outputs->filter_ctx = m_source_context; + outputs->pad_idx = 0; + outputs->next = nullptr; + + const char* description = "yadif=1:-1:0"; + ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr); + if (ret < 0) { + LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret)); + return; + } + + ret = avfilter_graph_config(m_filter_graph, nullptr); + if (ret < 0) { + LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret)); + return; + } + + m_initialized = true; +} + +bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) { + if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(), + AV_BUFFERSRC_FLAG_KEEP_REF); + ret < 0) { + LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret)); + return false; + } + + return true; +} + +std::unique_ptr DeinterlaceFilter::DrainSinkFrame() { + auto dst_frame = std::make_unique(); + const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame()); + + if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF)) { + return {}; + } + + if (ret < 0) { + LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret)); + return {}; + } + + return dst_frame; +} + +DeinterlaceFilter::~DeinterlaceFilter() { + avfilter_graph_free(&m_filter_graph); +} + +void DecodeApi::Reset() { + m_deinterlace_filter.reset(); + m_hardware_context.reset(); + m_decoder_context.reset(); + m_decoder.reset(); +} + +bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) { + this->Reset(); + m_decoder.emplace(codec); + m_decoder_context.emplace(*m_decoder); + + // Enable GPU decoding if requested. + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Gpu) { + m_hardware_context.emplace(); + m_hardware_context->InitializeForDecoder(*m_decoder_context, *m_decoder); + } + + // Open the decoder context. + if (!m_decoder_context->OpenContext(*m_decoder)) { + this->Reset(); + return false; + } + + return true; +} + +bool DecodeApi::SendPacket(std::span packet_data, size_t configuration_size) { + FFmpeg::Packet packet(packet_data); + return m_decoder_context->SendPacket(packet); +} + +void DecodeApi::ReceiveFrames(std::queue>& frame_queue) { + // Receive raw frame from decoder. + bool is_interlaced; + auto frame = m_decoder_context->ReceiveFrame(&is_interlaced); + if (!frame) { + return; + } + + if (!is_interlaced) { + // If the frame is not interlaced, we can pend it now. + frame_queue.push(std::move(frame)); + } else { + // Create the deinterlacer if needed. + if (!m_deinterlace_filter) { + m_deinterlace_filter.emplace(*frame); + } + + // Add the frame we just received. + if (!m_deinterlace_filter->AddSourceFrame(*frame)) { + return; + } + + // Pend output fields. + while (true) { + auto filter_frame = m_deinterlace_filter->DrainSinkFrame(); + if (!filter_frame) { + break; + } + + frame_queue.push(std::move(filter_frame)); + } + } +} + +} // namespace FFmpeg diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.h b/src/video_core/host1x/ffmpeg/ffmpeg.h new file mode 100644 index 000000000..1de0bbd83 --- /dev/null +++ b/src/video_core/host1x/ffmpeg/ffmpeg.h @@ -0,0 +1,213 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include +#include + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/host1x/nvdec_common.h" + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#include +#include +#include +#include +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +namespace FFmpeg { + +class Packet; +class Frame; +class Decoder; +class HardwareContext; +class DecoderContext; +class DeinterlaceFilter; + +// Wraps an AVPacket, a container for compressed bitstream data. +class Packet { +public: + YUZU_NON_COPYABLE(Packet); + YUZU_NON_MOVEABLE(Packet); + + explicit Packet(std::span data); + ~Packet(); + + AVPacket* GetPacket() const { + return m_packet; + } + +private: + AVPacket* m_packet{}; +}; + +// Wraps an AVFrame, a container for audio and video stream data. +class Frame { +public: + YUZU_NON_COPYABLE(Frame); + YUZU_NON_MOVEABLE(Frame); + + explicit Frame(); + ~Frame(); + + int GetWidth() const { + return m_frame->width; + } + + int GetHeight() const { + return m_frame->height; + } + + AVPixelFormat GetPixelFormat() const { + return static_cast(m_frame->format); + } + + int GetStride(int plane) const { + return m_frame->linesize[plane]; + } + + int* GetStrides() const { + return m_frame->linesize; + } + + u8* GetData(int plane) const { + return m_frame->data[plane]; + } + + u8** GetPlanes() const { + return m_frame->data; + } + + void SetFormat(int format) { + m_frame->format = format; + } + + AVFrame* GetFrame() const { + return m_frame; + } + +private: + AVFrame* m_frame{}; +}; + +// Wraps an AVCodec, a type containing information about a codec. +class Decoder { +public: + YUZU_NON_COPYABLE(Decoder); + YUZU_NON_MOVEABLE(Decoder); + + explicit Decoder(Tegra::Host1x::NvdecCommon::VideoCodec codec); + ~Decoder() = default; + + bool SupportsDecodingOnDevice(AVPixelFormat* out_pix_fmt, AVHWDeviceType type) const; + + const AVCodec* GetCodec() const { + return m_codec; + } + +private: + const AVCodec* m_codec{}; +}; + +// Wraps AVBufferRef for an accelerated decoder. +class HardwareContext { +public: + YUZU_NON_COPYABLE(HardwareContext); + YUZU_NON_MOVEABLE(HardwareContext); + + static std::vector GetSupportedDeviceTypes(); + + explicit HardwareContext() = default; + ~HardwareContext(); + + bool InitializeForDecoder(DecoderContext& decoder_context, const Decoder& decoder); + + AVBufferRef* GetBufferRef() const { + return m_gpu_decoder; + } + +private: + bool InitializeWithType(AVHWDeviceType type); + + AVBufferRef* m_gpu_decoder{}; +}; + +// Wraps an AVCodecContext. +class DecoderContext { +public: + YUZU_NON_COPYABLE(DecoderContext); + YUZU_NON_MOVEABLE(DecoderContext); + + explicit DecoderContext(const Decoder& decoder); + ~DecoderContext(); + + void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt); + bool OpenContext(const Decoder& decoder); + bool SendPacket(const Packet& packet); + std::unique_ptr ReceiveFrame(bool* out_is_interlaced); + + AVCodecContext* GetCodecContext() const { + return m_codec_context; + } + +private: + AVCodecContext* m_codec_context{}; +}; + +// Wraps an AVFilterGraph. +class DeinterlaceFilter { +public: + YUZU_NON_COPYABLE(DeinterlaceFilter); + YUZU_NON_MOVEABLE(DeinterlaceFilter); + + explicit DeinterlaceFilter(const Frame& frame); + ~DeinterlaceFilter(); + + bool AddSourceFrame(const Frame& frame); + std::unique_ptr DrainSinkFrame(); + +private: + AVFilterGraph* m_filter_graph{}; + AVFilterContext* m_source_context{}; + AVFilterContext* m_sink_context{}; + bool m_initialized{}; +}; + +class DecodeApi { +public: + YUZU_NON_COPYABLE(DecodeApi); + YUZU_NON_MOVEABLE(DecodeApi); + + DecodeApi() = default; + ~DecodeApi() = default; + + bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec); + void Reset(); + + bool SendPacket(std::span packet_data, size_t configuration_size); + void ReceiveFrames(std::queue>& frame_queue); + +private: + std::optional m_decoder; + std::optional m_decoder_context; + std::optional m_hardware_context; + std::optional m_deinterlace_filter; +}; + +} // namespace FFmpeg diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp index a4bd5b79f..b8f5866d3 100644 --- a/src/video_core/host1x/nvdec.cpp +++ b/src/video_core/host1x/nvdec.cpp @@ -28,7 +28,7 @@ void Nvdec::ProcessMethod(u32 method, u32 argument) { } } -AVFramePtr Nvdec::GetFrame() { +std::unique_ptr Nvdec::GetFrame() { return codec->GetCurrentFrame(); } diff --git a/src/video_core/host1x/nvdec.h b/src/video_core/host1x/nvdec.h index 3949d5181..ddddb8d28 100644 --- a/src/video_core/host1x/nvdec.h +++ b/src/video_core/host1x/nvdec.h @@ -23,7 +23,7 @@ public: void ProcessMethod(u32 method, u32 argument); /// Return most recently decoded frame - [[nodiscard]] AVFramePtr GetFrame(); + [[nodiscard]] std::unique_ptr GetFrame(); private: /// Invoke codec to decode a frame diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index 10d7ef884..2a5eba415 100644 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -82,27 +82,26 @@ void Vic::Execute() { return; } const VicConfig config{host1x.MemoryManager().Read(config_struct_address + 0x20)}; - const AVFramePtr frame_ptr = nvdec_processor->GetFrame(); - const auto* frame = frame_ptr.get(); + auto frame = nvdec_processor->GetFrame(); if (!frame) { return; } const u64 surface_width = config.surface_width_minus1 + 1; const u64 surface_height = config.surface_height_minus1 + 1; - if (static_cast(frame->width) != surface_width || - static_cast(frame->height) != surface_height) { + if (static_cast(frame->GetWidth()) != surface_width || + static_cast(frame->GetHeight()) != surface_height) { // TODO: Properly support multiple video streams with differing frame dimensions LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}", - frame->width, frame->height, surface_width, surface_height); + frame->GetWidth(), frame->GetHeight(), surface_width, surface_height); } switch (config.pixel_format) { case VideoPixelFormat::RGBA8: case VideoPixelFormat::BGRA8: case VideoPixelFormat::RGBX8: - WriteRGBFrame(frame, config); + WriteRGBFrame(std::move(frame), config); break; case VideoPixelFormat::YUV420: - WriteYUVFrame(frame, config); + WriteYUVFrame(std::move(frame), config); break; default: UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value()); @@ -110,10 +109,14 @@ void Vic::Execute() { } } -void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) { +void Vic::WriteRGBFrame(std::unique_ptr frame, const VicConfig& config) { LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); - if (!scaler_ctx || frame->width != scaler_width || frame->height != scaler_height) { + const auto frame_width = frame->GetWidth(); + const auto frame_height = frame->GetHeight(); + const auto frame_format = frame->GetPixelFormat(); + + if (!scaler_ctx || frame_width != scaler_width || frame_height != scaler_height) { const AVPixelFormat target_format = [pixel_format = config.pixel_format]() { switch (pixel_format) { case VideoPixelFormat::RGBA8: @@ -129,27 +132,26 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) { sws_freeContext(scaler_ctx); // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format - scaler_ctx = sws_getContext(frame->width, frame->height, - static_cast(frame->format), frame->width, - frame->height, target_format, 0, nullptr, nullptr, nullptr); - scaler_width = frame->width; - scaler_height = frame->height; + scaler_ctx = sws_getContext(frame_width, frame_height, frame_format, frame_width, + frame_height, target_format, 0, nullptr, nullptr, nullptr); + scaler_width = frame_width; + scaler_height = frame_height; converted_frame_buffer.reset(); } if (!converted_frame_buffer) { - const size_t frame_size = frame->width * frame->height * 4; + const size_t frame_size = frame_width * frame_height * 4; converted_frame_buffer = AVMallocPtr{static_cast(av_malloc(frame_size)), av_free}; } - const std::array converted_stride{frame->width * 4, frame->height * 4, 0, 0}; + const std::array converted_stride{frame_width * 4, frame_height * 4, 0, 0}; u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; - sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, &converted_frame_buf_addr, - converted_stride.data()); + sws_scale(scaler_ctx, frame->GetPlanes(), frame->GetStrides(), 0, frame_height, + &converted_frame_buf_addr, converted_stride.data()); // Use the minimum of surface/frame dimensions to avoid buffer overflow. const u32 surface_width = static_cast(config.surface_width_minus1) + 1; const u32 surface_height = static_cast(config.surface_height_minus1) + 1; - const u32 width = std::min(surface_width, static_cast(frame->width)); - const u32 height = std::min(surface_height, static_cast(frame->height)); + const u32 width = std::min(surface_width, static_cast(frame_width)); + const u32 height = std::min(surface_height, static_cast(frame_height)); const u32 blk_kind = static_cast(config.block_linear_kind); if (blk_kind != 0) { // swizzle pitch linear to block linear @@ -169,23 +171,23 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) { } } -void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) { +void Vic::WriteYUVFrame(std::unique_ptr frame, const VicConfig& config) { LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); const std::size_t surface_width = config.surface_width_minus1 + 1; const std::size_t surface_height = config.surface_height_minus1 + 1; const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; // Use the minimum of surface/frame dimensions to avoid buffer overflow. - const auto frame_width = std::min(surface_width, static_cast(frame->width)); - const auto frame_height = std::min(surface_height, static_cast(frame->height)); + const auto frame_width = std::min(surface_width, static_cast(frame->GetWidth())); + const auto frame_height = std::min(surface_height, static_cast(frame->GetHeight())); - const auto stride = static_cast(frame->linesize[0]); + const auto stride = static_cast(frame->GetStride(0)); luma_buffer.resize_destructive(aligned_width * surface_height); chroma_buffer.resize_destructive(aligned_width * surface_height / 2); // Populate luma buffer - const u8* luma_src = frame->data[0]; + const u8* luma_src = frame->GetData(0); for (std::size_t y = 0; y < frame_height; ++y) { const std::size_t src = y * stride; const std::size_t dst = y * aligned_width; @@ -196,16 +198,16 @@ void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) { // Chroma const std::size_t half_height = frame_height / 2; - const auto half_stride = static_cast(frame->linesize[1]); + const auto half_stride = static_cast(frame->GetStride(1)); - switch (frame->format) { + switch (frame->GetPixelFormat()) { case AV_PIX_FMT_YUV420P: { // Frame from FFmpeg software // Populate chroma buffer from both channels with interleaving. const std::size_t half_width = frame_width / 2; u8* chroma_buffer_data = chroma_buffer.data(); - const u8* chroma_b_src = frame->data[1]; - const u8* chroma_r_src = frame->data[2]; + const u8* chroma_b_src = frame->GetData(1); + const u8* chroma_r_src = frame->GetData(2); for (std::size_t y = 0; y < half_height; ++y) { const std::size_t src = y * half_stride; const std::size_t dst = y * aligned_width; @@ -219,7 +221,7 @@ void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) { case AV_PIX_FMT_NV12: { // Frame from VA-API hardware // This is already interleaved so just copy - const u8* chroma_src = frame->data[1]; + const u8* chroma_src = frame->GetData(1); for (std::size_t y = 0; y < half_height; ++y) { const std::size_t src = y * stride; const std::size_t dst = y * aligned_width; diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h index 3d9753047..6c868f062 100644 --- a/src/video_core/host1x/vic.h +++ b/src/video_core/host1x/vic.h @@ -39,9 +39,9 @@ public: private: void Execute(); - void WriteRGBFrame(const AVFrame* frame, const VicConfig& config); + void WriteRGBFrame(std::unique_ptr frame, const VicConfig& config); - void WriteYUVFrame(const AVFrame* frame, const VicConfig& config); + void WriteYUVFrame(std::unique_ptr frame, const VicConfig& config); Host1x& host1x; std::shared_ptr nvdec_processor; -- cgit v1.2.3 From c67644f1dae45fbcf621ba88bbce822b8637574e Mon Sep 17 00:00:00 2001 From: Ameer J <52414509+ameerj@users.noreply.github.com> Date: Sat, 18 Nov 2023 00:44:05 -0500 Subject: gl_graphics_pipeline: GLASM: Fix transform feedback attribs buffer mode GL_SEPARATE_ATTRIBS only applies when multiple buffers are being used, else GL_INTERLEAVED_ATTRIBS handles the cases for a single buffer with potentially more than one attribute --- src/video_core/renderer_opengl/gl_graphics_pipeline.cpp | 6 +++++- src/video_core/renderer_opengl/gl_graphics_pipeline.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index 44a771d65..af0a453ee 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -559,7 +559,9 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } void GraphicsPipeline::ConfigureTransformFeedbackImpl() const { - glTransformFeedbackAttribsNV(num_xfb_attribs, xfb_attribs.data(), GL_SEPARATE_ATTRIBS); + const GLenum buffer_mode = + num_xfb_buffers_active == 1 ? GL_INTERLEAVED_ATTRIBS : GL_SEPARATE_ATTRIBS; + glTransformFeedbackAttribsNV(num_xfb_attribs, xfb_attribs.data(), buffer_mode); } void GraphicsPipeline::GenerateTransformFeedbackState() { @@ -567,12 +569,14 @@ void GraphicsPipeline::GenerateTransformFeedbackState() { // when this is required. GLint* cursor{xfb_attribs.data()}; + num_xfb_buffers_active = 0; for (size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) { const auto& layout = key.xfb_state.layouts[feedback]; UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding"); if (layout.varying_count == 0) { continue; } + num_xfb_buffers_active++; const auto& locations = key.xfb_state.varyings[feedback]; std::optional current_index; diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h index 74fc9cc3d..2f70c1ae9 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h @@ -154,6 +154,7 @@ private: static constexpr std::size_t XFB_ENTRY_STRIDE = 3; GLsizei num_xfb_attribs{}; + u32 num_xfb_buffers_active{}; std::array xfb_attribs{}; std::mutex built_mutex; -- cgit v1.2.3 From feb60de5c33a394b4998a45a549b99e7f0a4fe18 Mon Sep 17 00:00:00 2001 From: Ameer J <52414509+ameerj@users.noreply.github.com> Date: Sat, 18 Nov 2023 07:39:47 -0500 Subject: shader_recompiler: Fix spelling of "derivate" (#12067) --- src/video_core/engines/fermi_2d.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 02e161270..91f10aec2 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -72,7 +72,7 @@ void Fermi2D::Blit() { UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled"); const auto& args = regs.pixels_from_memory; - constexpr s64 null_derivate = 1ULL << 32; + constexpr s64 null_derivative = 1ULL << 32; Surface src = regs.src; const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format)); const bool delegate_to_gpu = src.width > 512 && src.height > 512 && bytes_per_pixel <= 8 && @@ -89,7 +89,7 @@ void Fermi2D::Blit() { .operation = regs.operation, .filter = args.sample_mode.filter, .must_accelerate = - args.du_dx != null_derivate || args.dv_dy != null_derivate || delegate_to_gpu, + args.du_dx != null_derivative || args.dv_dy != null_derivative || delegate_to_gpu, .dst_x0 = args.dst_x0, .dst_y0 = args.dst_y0, .dst_x1 = args.dst_x0 + args.dst_width, -- cgit v1.2.3 From ae60a5657e8262472754e3b6583a5a1c59522f0e Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 18 Nov 2023 19:26:14 +0100 Subject: Buffer Cache: Eliminate clears on Indirect buffers --- src/video_core/buffer_cache/buffer_cache.h | 5 ----- src/video_core/engines/maxwell_3d.cpp | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 081a574e8..f5b10411b 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -1192,11 +1192,6 @@ void BufferCache

::UpdateDrawIndirect() { .size = static_cast(size), .buffer_id = FindBuffer(*cpu_addr, static_cast(size)), }; - VAddr cpu_addr_start = Common::AlignDown(*cpu_addr, 64); - VAddr cpu_addr_end = Common::AlignUp(*cpu_addr + size, 64); - IntervalType interval{cpu_addr_start, cpu_addr_end}; - ClearDownload(interval); - common_ranges.subtract(interval); }; if (current_draw_indirect->include_count) { update(current_draw_indirect->count_start_address, sizeof(u32), diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 32d767d85..592c28ba3 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -268,7 +268,7 @@ size_t Maxwell3D::EstimateIndexBufferSize() { std::numeric_limits::max()}; const size_t byte_size = regs.index_buffer.FormatSizeInBytes(); const size_t log2_byte_size = Common::Log2Ceil64(byte_size); - const size_t cap{GetMaxCurrentVertices() * 3 * byte_size}; + const size_t cap{GetMaxCurrentVertices() * 4 * byte_size}; const size_t lower_cap = std::min(static_cast(end_address - start_address), cap); return std::min( -- cgit v1.2.3 From a493ba76b455374b6a9cba40df75e328ad72bc0e Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 19 Nov 2023 14:29:06 +0100 Subject: Vulkan: Be more generous with pipeline workers for Android --- .../renderer_vulkan/vk_pipeline_cache.cpp | 23 +++++++++++++++++----- src/video_core/vulkan_common/vulkan_device.cpp | 6 ++++++ src/video_core/vulkan_common/vulkan_device.h | 7 +++++++ 3 files changed, 31 insertions(+), 5 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 22bf8cc77..16ad8d625 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -263,6 +263,22 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span program info.y_negate = key.state.y_negate != 0; return info; } + +size_t GetTotalPipelineWorkers() { + const size_t max_core_threads = + std::max(static_cast(std::thread::hardware_concurrency()), 2ULL) - 1ULL; +#ifdef ANDROID + // Leave at least a few cores free in android + constexpr size_t free_cores = 3ULL; + if (max_core_threads <= free_cores) { + return 1ULL; + } + return max_core_threads - free_cores; +#else + return max_core_threads; +#endif +} + } // Anonymous namespace size_t ComputePipelineCacheKey::Hash() const noexcept { @@ -294,11 +310,8 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device texture_cache{texture_cache_}, shader_notify{shader_notify_}, use_asynchronous_shaders{Settings::values.use_asynchronous_shaders.GetValue()}, use_vulkan_pipeline_cache{Settings::values.use_vulkan_driver_pipeline_cache.GetValue()}, -#ifdef ANDROID - workers(1, "VkPipelineBuilder"), -#else - workers(std::max(std::thread::hardware_concurrency(), 2U) - 1, "VkPipelineBuilder"), -#endif + workers(device.HasBrokenParallelShaderCompiling() ? 1ULL : GetTotalPipelineWorkers(), + "VkPipelineBuilder"), serialization_thread(1, "VkPipelineSerialization") { const auto& float_control{device.FloatControlProperties()}; const VkDriverId driver_id{device.GetDriverID()}; diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index e518756d2..6900b8ffa 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -635,6 +635,12 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR has_broken_cube_compatibility = true; } } + if (is_qualcomm) { + const u32 version = (properties.properties.driverVersion << 3) >> 3; + if (version < VK_MAKE_API_VERSION(0, 255, 615, 512)) { + has_broken_parallel_compiling = true; + } + } if (extensions.sampler_filter_minmax && is_amd) { // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken. if (!features.shader_float16_int8.shaderFloat16) { diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index b213ed7dd..355de0616 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -102,6 +102,7 @@ VK_DEFINE_HANDLE(VmaAllocator) EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME) \ + EXTENSION_NAME(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_4444_FORMATS_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_LINE_RASTERIZATION_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME) \ @@ -599,6 +600,11 @@ public: return has_broken_cube_compatibility; } + /// Returns true if parallel shader compiling has issues with the current driver. + bool HasBrokenParallelShaderCompiling() const { + return has_broken_parallel_compiling; + } + /// Returns the vendor name reported from Vulkan. std::string_view GetVendorName() const { return properties.driver.driverName; @@ -794,6 +800,7 @@ private: bool is_non_gpu{}; ///< Is SoftwareRasterizer, FPGA, non-GPU device. bool has_broken_compute{}; ///< Compute shaders can cause crashes bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit + bool has_broken_parallel_compiling{}; ///< Has broken parallel shader compiling. bool has_renderdoc{}; ///< Has RenderDoc attached bool has_nsight_graphics{}; ///< Has Nsight Graphics attached bool supports_d24_depth{}; ///< Supports D24 depth buffers. -- cgit v1.2.3 From 473caaff5b02dc75404943dee6b12234995136d4 Mon Sep 17 00:00:00 2001 From: Liam Date: Sun, 19 Nov 2023 11:27:12 -0500 Subject: renderer_vulkan: ignore viewport stores on non-supporting drivers --- src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 1 + src/video_core/vulkan_common/vulkan_device.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 16ad8d625..89b455bff 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -351,6 +351,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(), .support_native_ndc = device.IsExtDepthClipControlSupported(), .support_scaled_attributes = !device.MustEmulateScaledFormats(), + .support_multi_viewport = device.SupportsMultiViewport(), .warp_size_potentially_larger_than_guest = device.IsWarpSizePotentiallyBiggerThanGuest(), diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 355de0616..4f3846345 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -669,6 +669,10 @@ public: return supports_conditional_barriers; } + bool SupportsMultiViewport() const { + return features2.features.multiViewport; + } + [[nodiscard]] static constexpr bool CheckBrokenCompute(VkDriverId driver_id, u32 driver_version) { if (driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { -- cgit v1.2.3 From e7878e3cf812ce9e38e1fdee3735edc764f2bf81 Mon Sep 17 00:00:00 2001 From: Liam Date: Mon, 20 Nov 2023 20:09:12 -0500 Subject: vk_texture_cache: add workaround for nullDescriptor on Mali --- src/video_core/renderer_vulkan/vk_texture_cache.cpp | 18 ++++++++++++++++-- src/video_core/renderer_vulkan/vk_texture_cache.h | 1 + 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index de34f6d49..5dbec2e62 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1785,8 +1785,22 @@ ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, : VideoCommon::ImageViewBase{info, view_info, gpu_addr_}, buffer_size{VideoCommon::CalculateGuestSizeInBytes(info)} {} -ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams& params) - : VideoCommon::ImageViewBase{params} {} +ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageViewParams& params) + : VideoCommon::ImageViewBase{params}, device{&runtime.device} { + if (device->HasNullDescriptor()) { + return; + } + + // Handle fallback for devices without nullDescriptor + ImageInfo info{}; + info.format = PixelFormat::A8B8G8R8_UNORM; + + null_image = MakeImage(*device, runtime.memory_allocator, info, {}); + image_handle = *null_image; + for (u32 i = 0; i < Shader::NUM_TEXTURE_TYPES; i++) { + image_views[i] = MakeView(VK_FORMAT_A8B8G8R8_UNORM_PACK32, VK_IMAGE_ASPECT_COLOR_BIT); + } +} ImageView::~ImageView() = default; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 7a0807709..edf5d7635 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -267,6 +267,7 @@ private: vk::ImageView depth_view; vk::ImageView stencil_view; vk::ImageView color_view; + vk::Image null_image; VkImage image_handle = VK_NULL_HANDLE; VkImageView render_target = VK_NULL_HANDLE; VkSampleCountFlagBits samples = VK_SAMPLE_COUNT_1_BIT; -- cgit v1.2.3 From 453fd470307692b63c8395177fe0f911367466d9 Mon Sep 17 00:00:00 2001 From: Liam Date: Thu, 23 Nov 2023 09:33:02 -0500 Subject: query_cache: demote report synced unreachable to assert --- src/video_core/query_cache/query_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/video_core') diff --git a/src/video_core/query_cache/query_cache.h b/src/video_core/query_cache/query_cache.h index 78b42b518..efa9adf7a 100644 --- a/src/video_core/query_cache/query_cache.h +++ b/src/video_core/query_cache/query_cache.h @@ -266,7 +266,7 @@ void QueryCacheBase::CounterReport(GPUVAddr addr, QueryType counter_type return; } if (False(query_base->flags & QueryFlagBits::IsFinalValueSynced)) [[unlikely]] { - UNREACHABLE(); + ASSERT(false); return; } query_base->value += streamer->GetAmmendValue(); -- cgit v1.2.3 From 4a278b69b1393bc9b3766b7f9cb2eb7f47b3eb07 Mon Sep 17 00:00:00 2001 From: Liam Date: Fri, 24 Nov 2023 11:38:39 -0500 Subject: renderer_vulkan: exclude steam deck oled from force max clock setting --- src/video_core/vulkan_common/vulkan_device.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/video_core') diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 6900b8ffa..fde36a49c 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -869,7 +869,8 @@ bool Device::ShouldBoostClocks() const { driver_id == VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA || driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY || driver_id == VK_DRIVER_ID_MESA_TURNIP; - const bool is_steam_deck = vendor_id == 0x1002 && device_id == 0x163F; + const bool is_steam_deck = (vendor_id == 0x1002 && device_id == 0x163F) || + (vendor_id == 0x1002 && device_id == 0x1435); const bool is_debugging = this->HasDebuggingToolAttached(); -- cgit v1.2.3